acs <- read.table("http://jaredlander.com/data/acs_ny.csv",sep=",",
                  header=TRUE, stringsAsFactors=TRUE)

Load the necessary libraries

library(ggplot2)
library(useful)
library(caret)
## Loading required package: lattice
library(ISLR)
library(scales)
library(plyr)
library(rpart)
library(rpart.plot)
library(class)
library(MuMIn)
library(caret)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RColorBrewer)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:MuMIn':
## 
##     importance
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:scales':
## 
##     alpha
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(nlme)
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'lme4'
## The following object is masked from 'package:nlme':
## 
##     lmList
library(mgcv)
## This is mgcv 1.8-11. For overview type 'help("mgcv-package")'.

Let’s assume our goal is to build a model to predict if household income is greater than $250,000 per year.

Data preparation

Start by building a binary response variable.

acs$HighIncome <- as.numeric(with(acs, FamilyIncome >= 250000))
head(acs)
##   Acres FamilyIncome  FamilyType NumBedrooms NumChildren NumPeople
## 1  1-10          150     Married           4           1         3
## 2  1-10          180 Female Head           3           2         4
## 3  1-10          280 Female Head           4           0         2
## 4  1-10          330 Female Head           2           1         2
## 5  1-10          330   Male Head           3           1         2
## 6  1-10          480   Male Head           0           3         4
##   NumRooms        NumUnits NumVehicles NumWorkers  OwnRent   YearBuilt
## 1        9 Single detached           1          0 Mortgage   1950-1959
## 2        6 Single detached           2          0   Rented Before 1939
## 3        8 Single detached           3          1 Mortgage   2000-2004
## 4        4 Single detached           1          0   Rented   1950-1959
## 5        5 Single attached           1          0 Mortgage Before 1939
## 6        1 Single detached           0          0   Rented Before 1939
##   HouseCosts ElectricBill FoodStamp HeatingFuel Insurance       Language
## 1       1800           90        No         Gas      2500        English
## 2        850           90        No         Oil         0        English
## 3       2600          260        No         Oil      6600 Other European
## 4       1800          140        No         Oil         0        English
## 5        860          150        No         Gas       660        Spanish
## 6        700          140        No         Gas         0        English
##   HighIncome
## 1          0
## 2          0
## 3          0
## 4          0
## 5          0
## 6          0
tail(acs)
##       Acres FamilyIncome FamilyType NumBedrooms NumChildren NumPeople
## 22740   10+       556250    Married           4           0         2
## 22741   10+       565000    Married           5           3         5
## 22742   10+       599000    Married           4           0         2
## 22743   10+       611700    Married           4           1         5
## 22744   10+       621430    Married           3           2         4
## 22745   10+       751200    Married           4           2         4
##       NumRooms        NumUnits NumVehicles NumWorkers  OwnRent   YearBuilt
## 22740       10 Single detached           2          1 Mortgage Before 1939
## 22741       10 Single detached           2          2 Mortgage   1990-1999
## 22742        6 Single detached           2          2 Mortgage Before 1939
## 22743        9 Single detached           5          3 Mortgage Before 1939
## 22744       11 Single detached           2          3 Mortgage   1970-1979
## 22745       10 Single detached           2          2 Mortgage   1940-1949
##       HouseCosts ElectricBill FoodStamp HeatingFuel Insurance Language
## 22740       1800          380        No         Oil      1700  English
## 22741       1700          370        No         Gas      1000  English
## 22742       1300          100        No         Gas      3500  English
## 22743        410          100        No         Oil      1300  Spanish
## 22744       1600           80        No         Gas       800  Spanish
## 22745       6500          400        No         Oil      2000  English
##       HighIncome
## 22740          1
## 22741          1
## 22742          1
## 22743          1
## 22744          1
## 22745          1

Before splitting data, add/modify some variables for possible use in models

  1. Food Stamp to binary integer for linear regression
  2. Own home to binary integer for linear regression
  3. Family type to numerical for linear regression later
acs$foodstamp_binary <- ifelse(acs$FoodStamp == "Yes",1,0) # (yes = 1, no = 0)

acs$own_home <- ifelse(acs$OwnRent == "Rented",0, ifelse(acs$FamilyIncome == "Mortgage",1,2)) # (own = 1, rent = 0)

acs$family_type_cat <- ifelse(acs$FamilyType == "Married",1, 
                              ifelse(acs$FamilyIncome == "Female Head",2,3)) # married = 1, male head = 2, female head = 3

Based on groupby and plots (completed later) create new variables for potential use

acs$InsuranceHigh <- (acs$Insurance > 1000) * 1
acs$NumWorkers2 <- (acs$NumWorkers == 2) * 1
acs$HouseCostsHigh <- (acs$HouseCosts > 1000) * 1
acs$high_electric <- (acs$ElectricBill > 350) * 1

Break it into a training and test set with an 80/20 split.

set.seed(447)
testrecs <- sample(nrow(acs),0.2 * nrow(acs))
acs_test <- acs[testrecs,]
acs_fit <- acs[-testrecs,]  

Create binary variable where 1 = not on food stamps & not renting & married

acs$HI_pred1 <- 0
acs$HI_pred1[acs_test$FoodStamp == 'No' & acs_test$OwnRent != 'Rented' & acs_test$FamilyType == 'Married'] <- 1
# Nice visualization for a quick visual of what you're dealing with before digging in
ggplot(acs,aes(x=FamilyIncome)) + geom_density(fill="#31a354", color="#31a354") +
  geom_vline(xintercept=250000) + scale_x_continuous(label=multiple.dollar, limits=c(0,1000000))
## Warning: Removed 13 rows containing non-finite values (stat_density).

Interesting test I found for testing data normality

Downside is it has a 5000 observation limit. If p-value is less than .05 (or chosen significance level) then sample is NOT normally distributed. This is not relevant here, but worth keeping for future. The plot shows that there is a clear left skewned distribution - expected but still nice to include

shapiro.test(acs_test$FamilyIncome)
## 
##  Shapiro-Wilk normality test
## 
## data:  acs_test$FamilyIncome
## W = 0.71538, p-value < 2.2e-16

Preliminary EDA and feature engineering

Before trying to build any classification models, do some exploratory data analysis to try to get a sense of which variables might be useful for trying to predict cases where FamilyIncome >= 250000. You should use a combination of group by analysis (i.e. plyr or dplyr or similar) and plotting.

# Get some summary stats on each variable
summary(acs_fit)
##    Acres        FamilyIncome           FamilyType     NumBedrooms   
##  10+  :  815   Min.   :     50   Female Head: 2594   Min.   :0.000  
##  1-10 : 3709   1st Qu.:  53000   Male Head  :  907   1st Qu.:3.000  
##  Sub 1:13672   Median :  87100   Married    :14695   Median :3.000  
##                Mean   : 110726                       Mean   :3.384  
##                3rd Qu.: 134082                       3rd Qu.:4.000  
##                Max.   :1605000                       Max.   :8.000  
##                                                                     
##   NumChildren        NumPeople       NumRooms                NumUnits    
##  Min.   : 0.0000   Min.   : 2.0   Min.   : 1.00   Mobile home    :  583  
##  1st Qu.: 0.0000   1st Qu.: 2.0   1st Qu.: 6.00   Single attached: 1916  
##  Median : 0.0000   Median : 3.0   Median : 7.00   Single detached:15697  
##  Mean   : 0.9013   Mean   : 3.4   Mean   : 7.18                          
##  3rd Qu.: 2.0000   3rd Qu.: 4.0   3rd Qu.: 8.00                          
##  Max.   :12.0000   Max.   :18.0   Max.   :21.00                          
##                                                                          
##   NumVehicles      NumWorkers        OwnRent            YearBuilt   
##  Min.   :0.000   Min.   :0.000   Mortgage:16074   Before 1939:4874  
##  1st Qu.:2.000   1st Qu.:1.000   Outright:  128   1950-1959  :3107  
##  Median :2.000   Median :2.000   Rented  : 1994   1960-1969  :2201  
##  Mean   :2.119   Mean   :1.748                    1970-1979  :1894  
##  3rd Qu.:3.000   3rd Qu.:2.000                    1990-1999  :1647  
##  Max.   :6.000   Max.   :3.000                    1980-1989  :1598  
##                                                   (Other)    :2875  
##    HouseCosts    ElectricBill   FoodStamp        HeatingFuel   
##  Min.   :   4   Min.   :  1.0   No :16777   Gas        :10899  
##  1st Qu.: 650   1st Qu.:100.0   Yes: 1419   Oil        : 5187  
##  Median :1200   Median :150.0               Wood       :  986  
##  Mean   :1480   Mean   :174.4               Electricity:  831  
##  3rd Qu.:2000   3rd Qu.:220.0               Other      :  148  
##  Max.   :7090   Max.   :580.0               Coal       :  124  
##                                             (Other)    :   21  
##    Insurance                Language       HighIncome     
##  Min.   :   0.0   Asian Pacific :  639   Min.   :0.00000  
##  1st Qu.: 400.0   English       :14262   1st Qu.:0.00000  
##  Median : 720.0   Other         :  225   Median :0.00000  
##  Mean   : 958.9   Other European: 1640   Mean   :0.06139  
##  3rd Qu.:1200.0   Spanish       : 1430   3rd Qu.:0.00000  
##  Max.   :6600.0                          Max.   :1.00000  
##                                                           
##  foodstamp_binary     own_home     family_type_cat InsuranceHigh  
##  Min.   :0.00000   Min.   :0.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:0.000  
##  Median :0.00000   Median :2.000   Median :1.000   Median :0.000  
##  Mean   :0.07798   Mean   :1.781   Mean   :1.385   Mean   :0.326  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :1.00000   Max.   :2.000   Max.   :3.000   Max.   :1.000  
##                                                                   
##   NumWorkers2     HouseCostsHigh   high_electric    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :1.0000   Median :0.00000  
##  Mean   :0.4768   Mean   :0.5433   Mean   :0.06666  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
## 

Histogram

# see that those that those that own home correlate with higher incomes overall
ggplot(acs_fit) + geom_histogram(aes(x=own_home), fill = "gray")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scatterplots

# scatter number of workers and family income
ggplot(data=acs_fit) + geom_point(aes(x=NumWorkers, y=FamilyIncome))

# scatter plot shows that those not on foodstamps tend to have higher income = duh, but relevant for model later
ggplot(data=acs_fit) + geom_point(aes(x=foodstamp_binary, y=FamilyIncome))

# plot shows that homes with 2 workers correlate with higher incomes vs other number of workers
ggplot(data=acs_fit) + geom_point(aes(x=NumWorkers, y=FamilyIncome))

# notice that there are very few observations with male head type. Female head has lower income and married highest incomes
ggplot(data=acs_fit) + geom_point(aes(x=family_type_cat, y=FamilyIncome))

# scatter house costs and family income - see that higher house costs correlate to higher incomes (slightly) - nothin major though
ggplot(data=acs_fit) + geom_point(aes(x=HouseCosts, y=FamilyIncome))

# create matrix of scatterplots
##pairs(acs[,1:19])

Boxplot

coor_cartesian -> Setting limits on the coordinate system will zoom the plot (like you’re looking at it with a magnifying glass), and will not change the underlying data like setting limits on a scale will.

# See that outliers begin roughly around income of $100,000
ggplot(data=acs_fit) + geom_boxplot(aes(x=NumWorkers, y=FamilyIncome))  + coord_cartesian(ylim = c(0, 350000))
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

Density Plots

These show the density by variable on axis. These are useful to see the concentration range of values

ggplot(acs_fit) + geom_density(aes(x=acs_fit$FamilyIncome)) + scale_x_continuous(labels=dollar)

ggplot(acs_fit) + geom_density(aes(x=acs_fit$HouseCosts)) + scale_x_continuous(labels=dollar)

ggplot(acs_fit) + geom_density(aes(x=acs_fit$NumChildren)) + scale_x_continuous()

ggplot(acs_fit) + geom_density(aes(x=acs_fit$FamilyIncome)) + scale_x_log10(breaks =c(100,1000,10000,100000), labels=dollar) + annotation_logticks(sides="bt")

ggplot(acs_fit) + geom_density(aes(x=acs_fit$HouseCosts)) + scale_x_log10(breaks =c(100,1000,10000,100000), labels=dollar) + annotation_logticks(sides="bt")

Misc Plots

# shows positive correlation between insurance and family income
ggplot(acs_fit, aes(x=acs_fit$Insurance, y=acs_fit$FamilyIncome)) +geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam'

# density plot for electrical bill
ggplot(acs_fit) + geom_density(aes(x=acs_fit$ElectricBill)) + scale_x_log10(breaks =c(100,1000,10000,100000), labels=dollar) + annotation_logticks(sides="bt")

# shows positive correlation between electric bill and family income
ggplot(acs_fit, aes(x=acs_fit$ElectricBill, y=acs_fit$FamilyIncome)) +geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam'

Group by analysis

# This shows a good spread or range of each family type group. This will lend itself well to being included in my analysis
ddply(acs_fit,.(FamilyType),summarise,family_type_count=length(FamilyIncome))
##    FamilyType family_type_count
## 1 Female Head              2594
## 2   Male Head               907
## 3     Married             14695
# Interesting look at mean income of family type grouped with home ownership type
ddply(acs_fit,.(FamilyType,OwnRent), summarise, mean_income=mean(FamilyIncome))
##    FamilyType  OwnRent mean_income
## 1 Female Head Mortgage    69983.15
## 2 Female Head Outright    92303.57
## 3 Female Head   Rented    34258.14
## 4   Male Head Mortgage    79298.16
## 5   Male Head Outright   213250.00
## 6   Male Head   Rented    47620.92
## 7     Married Mortgage   126152.48
## 8     Married Outright   110020.36
## 9     Married   Rented    71331.00
ddply(acs_fit,.(FamilyType,FoodStamp), summarise, mean_income=mean(FamilyIncome))
##    FamilyType FoodStamp mean_income
## 1 Female Head        No    67144.55
## 2 Female Head       Yes    37105.47
## 3   Male Head        No    78679.59
## 4   Male Head       Yes    45825.75
## 5     Married        No   124832.07
## 6     Married       Yes    61612.91
# simple look at mean income by foodstamp. Obvious results, but at the same time surprising to find that mean income for those on food stamps in near $50k
ddply(acs_fit,.(FoodStamp), summarise, mean_income=mean(FamilyIncome))
##   FoodStamp mean_income
## 1        No   115898.77
## 2       Yes    49572.72
ddply(acs_fit,.(FoodStamp,NumBedrooms), summarise, mean_income=mean(FamilyIncome), num_bedrooms=mean(NumBedrooms))
##    FoodStamp NumBedrooms mean_income num_bedrooms
## 1         No           0    91092.92            0
## 2         No           1    68606.42            1
## 3         No           2    80216.78            2
## 4         No           3   102048.19            3
## 5         No           4   137687.35            4
## 6         No           5   173357.12            5
## 7         No           8   182832.27            8
## 8        Yes           0    36577.50            0
## 9        Yes           1    22038.06            1
## 10       Yes           2    33118.22            2
## 11       Yes           3    45066.36            3
## 12       Yes           4    62828.82            4
## 13       Yes           5    60499.15            5
## 14       Yes           8    82773.62            8
# This is a little excessive, but would be useful for piping it to a csv file (for example) if relevant to tasks at hand
##ddply(acs,.(NumBedrooms,NumChildren,NumPeople,NumRooms,NumUnits,NumVehicles,NumWorkers), summarise, mean_income=mean(FamilyIncome))
ddply(acs_fit,.(OwnRent), summarise, mean_income=mean(FamilyIncome))
##    OwnRent mean_income
## 1 Mortgage   117551.80
## 2 Outright   109695.55
## 3   Rented    55771.63

Count (family income) by various important indicators/variables

# Family Type
tapply(acs_fit$FamilyIncome,acs_fit$FamilyType,length)
## Female Head   Male Head     Married 
##        2594         907       14695
tapply(acs_fit$FamilyIncome,acs_fit$FamilyType,mean)
## Female Head   Male Head     Married 
##    60242.74    72992.65   121966.88
# Own/Rent
tapply(acs_fit$FamilyIncome,acs_fit$OwnRent,length)
## Mortgage Outright   Rented 
##    16074      128     1994
tapply(acs_fit$FamilyIncome,acs_fit$OwnRent,mean)
##  Mortgage  Outright    Rented 
## 117551.80 109695.55  55771.63
# Insurance
tapply(acs_fit$FamilyIncome,acs_fit$FoodStamp,length)
##    No   Yes 
## 16777  1419
tapply(acs_fit$FamilyIncome,acs_fit$FoodStamp,mean)
##        No       Yes 
## 115898.77  49572.72

Model Specification

Start by building a null model in which you simply predict that everyone’s income is < 250000 (since the majority of incomes are less than 250000).

Null Model for model comparison

acs$null_model <- 0

Create a confusion matrix table and compute the overall accuracy of this model as well as its sensitivity and specificity.

table(acs$HighIncome, acs$null_model)
##    
##         0
##   0 21356
##   1  1389
prop.table(table(acs$HighIncome, acs$null_model))
##    
##              0
##   0 0.93893163
##   1 0.06106837
confusionMatrix(as.factor(acs$null_model), as.factor(acs$HighIncome), positive = "1")
## Warning in confusionMatrix.default(as.factor(acs$null_model), as.factor(acs
## $HighIncome), : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 21356  1389
##          1     0     0
##                                          
##                Accuracy : 0.9389         
##                  95% CI : (0.9357, 0.942)
##     No Information Rate : 0.9389         
##     P-Value [Acc > NIR] : 0.5071         
##                                          
##                   Kappa : 0              
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.00000        
##             Specificity : 1.00000        
##          Pos Pred Value :     NaN        
##          Neg Pred Value : 0.93893        
##              Prevalence : 0.06107        
##          Detection Rate : 0.00000        
##    Detection Prevalence : 0.00000        
##       Balanced Accuracy : 0.50000        
##                                          
##        'Positive' Class : 1              
## 

Logistic Regression

  1. Specify the model
  2. Show summary results
  3. Predict using model
  4. Set binomial variable equal to predictions with criteria > .5
  5. Set variable = AIC
  6. Confusion matrix
  7. Display confusion matrix

logistic regression model 1

logmod1 <- glm(HighIncome ~ FamilyType + NumVehicles + OwnRent + Insurance + YearBuilt, data=acs_fit, 
               family=binomial(link="logit"))

summary(logmod1)
## 
## Call:
## glm(formula = HighIncome ~ FamilyType + NumVehicles + OwnRent + 
##     Insurance + YearBuilt, family = binomial(link = "logit"), 
##     data = acs_fit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9844  -0.3540  -0.2875  -0.1979   3.3452  
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -1.606e+01  1.539e+02  -0.104    0.917    
## FamilyTypeMale Head   6.722e-01  3.116e-01   2.157    0.031 *  
## FamilyTypeMarried     1.674e+00  2.035e-01   8.225  < 2e-16 ***
## NumVehicles           2.463e-01  3.374e-02   7.300 2.88e-13 ***
## OwnRentOutright       7.038e-01  3.182e-01   2.212    0.027 *  
## OwnRentRented        -1.159e-01  1.937e-01  -0.598    0.550    
## Insurance             6.605e-04  2.212e-05  29.855  < 2e-16 ***
## YearBuilt1940-1949    9.780e+00  1.539e+02   0.064    0.949    
## YearBuilt1950-1959    1.034e+01  1.539e+02   0.067    0.946    
## YearBuilt1960-1969    1.036e+01  1.539e+02   0.067    0.946    
## YearBuilt1970-1979    1.015e+01  1.539e+02   0.066    0.947    
## YearBuilt1980-1989    1.049e+01  1.539e+02   0.068    0.946    
## YearBuilt1990-1999    1.050e+01  1.539e+02   0.068    0.946    
## YearBuilt2000-2004    1.061e+01  1.539e+02   0.069    0.945    
## YearBuilt2005         1.071e+01  1.539e+02   0.070    0.945    
## YearBuilt2006         1.105e+01  1.539e+02   0.072    0.943    
## YearBuilt2007         1.074e+01  1.539e+02   0.070    0.944    
## YearBuilt2008         1.080e+01  1.539e+02   0.070    0.944    
## YearBuilt2009         1.046e+01  1.539e+02   0.068    0.946    
## YearBuilt2010         1.135e+01  1.539e+02   0.074    0.941    
## YearBuiltBefore 1939  1.033e+01  1.539e+02   0.067    0.946    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8398.1  on 18195  degrees of freedom
## Residual deviance: 7076.0  on 18175  degrees of freedom
## AIC: 7118
## 
## Number of Fisher Scoring iterations: 12
acs_test$yhat_logmod1 <- predict(logmod1, newdata=acs_test, type='response')

acs_test$yhat_logmod1 <- (acs_test$yhat_logmod1 > 0.05) * 1

log_mod1_aic <- summary(logmod1)$aic

log_cm1 <- confusionMatrix(as.factor(acs_test$yhat_logmod1), as.factor(acs_test$HighIncome), positive = "1")

log_cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2662   57
##          1 1615  215
##                                           
##                Accuracy : 0.6324          
##                  95% CI : (0.6182, 0.6465)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1121          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.79044         
##             Specificity : 0.62240         
##          Pos Pred Value : 0.11749         
##          Neg Pred Value : 0.97904         
##              Prevalence : 0.05979         
##          Detection Rate : 0.04726         
##    Detection Prevalence : 0.40229         
##       Balanced Accuracy : 0.70642         
##                                           
##        'Positive' Class : 1               
## 

logistic regression model 2

logmod2 <- glm(HighIncome ~ FamilyType + FoodStamp + OwnRent, data=acs_fit, family=binomial(link="logit"))

summary(logmod2)
## 
## Call:
## glm(formula = HighIncome ~ FamilyType + FoodStamp + OwnRent, 
##     family = binomial(link = "logit"), data = acs_fit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4992  -0.4048  -0.4048  -0.1728   3.7799  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -4.1971     0.1947 -21.557  < 2e-16 ***
## FamilyTypeMale Head   0.6399     0.3026   2.114   0.0345 *  
## FamilyTypeMarried     1.7363     0.1968   8.821  < 2e-16 ***
## FoodStampYes         -1.9012     0.3578  -5.314 1.08e-07 ***
## OwnRentOutright       0.4413     0.2964   1.489   0.1365    
## OwnRentRented        -1.0447     0.1885  -5.541 3.00e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8398.1  on 18195  degrees of freedom
## Residual deviance: 8037.1  on 18190  degrees of freedom
## AIC: 8049.1
## 
## Number of Fisher Scoring iterations: 7
acs_test$yhat_logmod2 <- predict(logmod2, newdata=acs_test, type='response')

acs_test$yhat_logmod2 <- (acs_test$yhat_logmod2 > 0.05) * 1

log_mod2_aic <- summary(logmod2)$aic

log_cm2 <- confusionMatrix(as.factor(acs_test$yhat_logmod2), as.factor(acs_test$HighIncome), positive = "1")

log_cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1265   28
##          1 3012  244
##                                          
##                Accuracy : 0.3317         
##                  95% CI : (0.318, 0.3456)
##     No Information Rate : 0.9402         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0314         
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.89706        
##             Specificity : 0.29577        
##          Pos Pred Value : 0.07494        
##          Neg Pred Value : 0.97834        
##              Prevalence : 0.05979        
##          Detection Rate : 0.05364        
##    Detection Prevalence : 0.71576        
##       Balanced Accuracy : 0.59641        
##                                          
##        'Positive' Class : 1              
## 

logistic regression model 3

logmod3 <- glm(HighIncome ~ InsuranceHigh + NumWorkers2 + HouseCostsHigh + FoodStamp + OwnRent, 
               data=acs_fit, family=binomial(link="logit"))

summary(logmod3)
## 
## Call:
## glm(formula = HighIncome ~ InsuranceHigh + NumWorkers2 + HouseCostsHigh + 
##     FoodStamp + OwnRent, family = binomial(link = "logit"), data = acs_fit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2253  -0.3183  -0.2682  -0.1640   3.6467  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -4.30241    0.09797 -43.915  < 2e-16 ***
## InsuranceHigh    1.34537    0.07504  17.929  < 2e-16 ***
## NumWorkers2      0.07590    0.06409   1.184   0.2363    
## HouseCostsHigh   1.26356    0.09694  13.035  < 2e-16 ***
## FoodStampYes    -2.07741    0.35866  -5.792 6.95e-09 ***
## OwnRentOutright  1.72956    0.31359   5.515 3.48e-08 ***
## OwnRentRented   -0.34415    0.19554  -1.760   0.0784 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8398.1  on 18195  degrees of freedom
## Residual deviance: 7305.5  on 18189  degrees of freedom
## AIC: 7319.5
## 
## Number of Fisher Scoring iterations: 7
acs_test$yhat_logmod3 <- predict(logmod3, newdata=acs_test, type='response')

acs_test$yhat_logmod3 <- (acs_test$yhat_logmod3 > 0.05) * 1

log_mod3_aic <- summary(logmod3)$aic

log_cm3 <- confusionMatrix(as.factor(acs_test$yhat_logmod3), as.factor(acs_test$HighIncome), positive = "1")

log_cm3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3090   72
##          1 1187  200
##                                         
##                Accuracy : 0.7232        
##                  95% CI : (0.71, 0.7362)
##     No Information Rate : 0.9402        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.1568        
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.73529       
##             Specificity : 0.72247       
##          Pos Pred Value : 0.14420       
##          Neg Pred Value : 0.97723       
##              Prevalence : 0.05979       
##          Detection Rate : 0.04397       
##    Detection Prevalence : 0.30490       
##       Balanced Accuracy : 0.72888       
##                                         
##        'Positive' Class : 1             
## 

logistic regression model 4

logmod4 <- glm(HighIncome ~ InsuranceHigh + NumWorkers2 + HouseCostsHigh, data=acs_fit, family=binomial(link="logit"))

summary(logmod4)
## 
## Call:
## glm(formula = HighIncome ~ InsuranceHigh + NumWorkers2 + HouseCostsHigh, 
##     family = binomial(link = "logit"), data = acs_fit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5958  -0.3163  -0.2879  -0.1577   2.9642  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -4.38071    0.09202 -47.604   <2e-16 ***
## InsuranceHigh   1.41041    0.07252  19.447   <2e-16 ***
## NumWorkers2     0.11334    0.06374   1.778   0.0753 .  
## HouseCostsHigh  1.21827    0.09407  12.950   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8398.1  on 18195  degrees of freedom
## Residual deviance: 7404.8  on 18192  degrees of freedom
## AIC: 7412.8
## 
## Number of Fisher Scoring iterations: 6
acs_test$yhat_logmod4 <- predict(logmod4, newdata=acs_test, type='response')

acs_test$yhat_logmod4 <- (acs_test$yhat_logmod4 > 0.05) * 1

log_mod4_aic <- summary(logmod4)$aic

log_cm4 <- confusionMatrix(as.factor(acs_test$yhat_logmod4), as.factor(acs_test$HighIncome), positive = "1")

log_cm4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3072   73
##          1 1205  199
##                                           
##                Accuracy : 0.7191          
##                  95% CI : (0.7058, 0.7321)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1526          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.73162         
##             Specificity : 0.71826         
##          Pos Pred Value : 0.14174         
##          Neg Pred Value : 0.97679         
##              Prevalence : 0.05979         
##          Detection Rate : 0.04375         
##    Detection Prevalence : 0.30864         
##       Balanced Accuracy : 0.72494         
##                                           
##        'Positive' Class : 1               
## 

logistic regression model 5

logmod5 <- glm(HighIncome ~ FamilyType + NumBedrooms + NumChildren + NumPeople + NumRooms + NumUnits + NumVehicles + 
                 NumWorkers + OwnRent + HouseCosts + ElectricBill + FoodStamp + Insurance + Language + 
                 InsuranceHigh + NumWorkers2 + HouseCostsHigh, data=acs_fit, family=binomial(link="logit"))

summary(logmod5)
## 
## Call:
## glm(formula = HighIncome ~ FamilyType + NumBedrooms + NumChildren + 
##     NumPeople + NumRooms + NumUnits + NumVehicles + NumWorkers + 
##     OwnRent + HouseCosts + ElectricBill + FoodStamp + Insurance + 
##     Language + InsuranceHigh + NumWorkers2 + HouseCostsHigh, 
##     family = binomial(link = "logit"), data = acs_fit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5995  -0.3188  -0.2038  -0.1272   3.6788  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -2.020e+01  1.550e+02  -0.130 0.896356    
## FamilyTypeMale Head      5.779e-01  3.213e-01   1.798 0.072102 .  
## FamilyTypeMarried        1.436e+00  2.069e-01   6.937 4.00e-12 ***
## NumBedrooms              1.191e-01  3.507e-02   3.396 0.000685 ***
## NumChildren              1.853e-01  5.317e-02   3.486 0.000491 ***
## NumPeople               -2.690e-01  5.039e-02  -5.338 9.39e-08 ***
## NumRooms                 1.064e-01  1.426e-02   7.461 8.61e-14 ***
## NumUnitsSingle attached  1.320e+01  1.550e+02   0.085 0.932166    
## NumUnitsSingle detached  1.284e+01  1.550e+02   0.083 0.933970    
## NumVehicles              2.531e-01  4.244e-02   5.963 2.48e-09 ***
## NumWorkers               1.778e-01  5.742e-02   3.096 0.001964 ** 
## OwnRentOutright          1.930e+00  3.441e-01   5.609 2.04e-08 ***
## OwnRentRented            3.982e-01  2.026e-01   1.965 0.049373 *  
## HouseCosts               4.985e-04  2.945e-05  16.926  < 2e-16 ***
## ElectricBill             2.001e-03  2.980e-04   6.714 1.89e-11 ***
## FoodStampYes            -1.255e+00  3.727e-01  -3.367 0.000760 ***
## Insurance                2.365e-04  3.125e-05   7.568 3.79e-14 ***
## LanguageEnglish         -2.851e-01  1.629e-01  -1.750 0.080110 .  
## LanguageOther           -1.487e-01  2.899e-01  -0.513 0.607872    
## LanguageOther European  -1.955e-01  1.839e-01  -1.063 0.287918    
## LanguageSpanish         -6.078e-01  2.096e-01  -2.899 0.003739 ** 
## InsuranceHigh            4.681e-01  9.381e-02   4.990 6.05e-07 ***
## NumWorkers2              3.137e-02  7.781e-02   0.403 0.686814    
## HouseCostsHigh           1.797e-01  1.155e-01   1.555 0.119832    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8398.1  on 18195  degrees of freedom
## Residual deviance: 6140.7  on 18172  degrees of freedom
## AIC: 6188.7
## 
## Number of Fisher Scoring iterations: 16
acs_test$yhat_logmod5 <- predict(logmod5, newdata=acs_test, type='response')

acs_test$yhat_logmod5 <- (acs_test$yhat_logmod5 > 0.05) * 1

log_mod5_aic <- summary(logmod5)$aic

log_cm5 <- confusionMatrix(as.factor(acs_test$yhat_logmod5), as.factor(acs_test$HighIncome), positive = "1")

log_cm5
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3145   50
##          1 1132  222
##                                           
##                Accuracy : 0.7402          
##                  95% CI : (0.7272, 0.7529)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1927          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.81618         
##             Specificity : 0.73533         
##          Pos Pred Value : 0.16396         
##          Neg Pred Value : 0.98435         
##              Prevalence : 0.05979         
##          Detection Rate : 0.04880         
##    Detection Prevalence : 0.29765         
##       Balanced Accuracy : 0.77575         
##                                           
##        'Positive' Class : 1               
## 

logistic regression model 6

logmod6 <- glm(HighIncome ~ FamilyType + NumBedrooms + NumChildren + OwnRent + 
              HouseCosts + ElectricBill + FoodStamp + InsuranceHigh, 
              data=acs_fit, family=binomial(link="logit"))

summary(logmod6)
## 
## Call:
## glm(formula = HighIncome ~ FamilyType + NumBedrooms + NumChildren + 
##     OwnRent + HouseCosts + ElectricBill + FoodStamp + InsuranceHigh, 
##     family = binomial(link = "logit"), data = acs_fit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1642  -0.3266  -0.2067  -0.1488   3.8673  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -7.125e+00  2.339e-01 -30.467  < 2e-16 ***
## FamilyTypeMale Head  5.904e-01  3.158e-01   1.869   0.0616 .  
## FamilyTypeMarried    1.624e+00  2.028e-01   8.006 1.19e-15 ***
## NumBedrooms          2.568e-01  2.700e-02   9.514  < 2e-16 ***
## NumChildren         -7.037e-02  2.962e-02  -2.376   0.0175 *  
## OwnRentOutright      1.963e+00  3.122e-01   6.287 3.24e-10 ***
## OwnRentRented        5.207e-02  1.983e-01   0.263   0.7929    
## HouseCosts           5.746e-04  2.467e-05  23.294  < 2e-16 ***
## ElectricBill         2.358e-03  2.826e-04   8.341  < 2e-16 ***
## FoodStampYes        -1.764e+00  3.640e-01  -4.846 1.26e-06 ***
## InsuranceHigh        7.961e-01  8.077e-02   9.856  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8398.1  on 18195  degrees of freedom
## Residual deviance: 6352.8  on 18185  degrees of freedom
## AIC: 6374.8
## 
## Number of Fisher Scoring iterations: 8
acs_test$yhat_logmod6 <- predict(logmod6, newdata=acs_test, type='response')

acs_test$yhat_logmod6 <- (acs_test$yhat_logmod6 > 0.05) * 1

log_mod6_aic <- summary(logmod6)$aic

log_cm6 <- confusionMatrix(as.factor(acs_test$yhat_logmod6), as.factor(acs_test$HighIncome), positive = "1")

log_cm6
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3090   55
##          1 1187  217
##                                           
##                Accuracy : 0.727           
##                  95% CI : (0.7138, 0.7399)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1764          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.79779         
##             Specificity : 0.72247         
##          Pos Pred Value : 0.15456         
##          Neg Pred Value : 0.98251         
##              Prevalence : 0.05979         
##          Detection Rate : 0.04770         
##    Detection Prevalence : 0.30864         
##       Balanced Accuracy : 0.76013         
##                                           
##        'Positive' Class : 1               
## 

Linear Regression

  1. Specify the model
  2. Show summary results
  3. Predict using model
  4. Set binomilal variable equal to predictions with criteria > 250000
  5. Set variable = r squared
  6. Set variable = AIC
  7. Confusion matrix
  8. Display confusion matrix

Linear regression model 1

linear_mod1 <- lm(FamilyIncome ~ FamilyType + FoodStamp + OwnRent + HouseCosts + Insurance + ElectricBill + 
                    NumRooms, data=acs_fit)

summary(linear_mod1)
## 
## Call:
## lm(formula = FamilyIncome ~ FamilyType + FoodStamp + OwnRent + 
##     HouseCosts + Insurance + ElectricBill + NumRooms, data = acs_fit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -372890  -39833   -9227   22628 1198390 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -2.550e+04  2.699e+03  -9.448  < 2e-16 ***
## FamilyTypeMale Head  8.939e+03  3.233e+03   2.765   0.0057 ** 
## FamilyTypeMarried    3.814e+04  1.866e+03  20.435  < 2e-16 ***
## FoodStampYes        -2.614e+04  2.485e+03 -10.519  < 2e-16 ***
## OwnRentOutright      4.074e+04  7.468e+03   5.455 4.95e-08 ***
## OwnRentRented       -1.391e+03  2.241e+03  -0.620   0.5350    
## HouseCosts           2.738e+01  6.451e-01  42.439  < 2e-16 ***
## Insurance            1.839e+01  7.637e-01  24.078  < 2e-16 ***
## ElectricBill         5.154e+01  6.313e+00   8.163 3.48e-16 ***
## NumRooms             5.537e+03  2.793e+02  19.823  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 83680 on 18186 degrees of freedom
## Multiple R-squared:  0.3138, Adjusted R-squared:  0.3135 
## F-statistic: 924.1 on 9 and 18186 DF,  p-value: < 2.2e-16
acs_test$lin_mod1_FamilyIncome <- predict(linear_mod1, newdata=acs_test)

acs_test$lin_mod1_HighIncome <- ifelse(acs_test$lin_mod1_FamilyIncome > 250000,1,0)

linear_mod1_rsq <- summary(linear_mod1)$r.sq

linear_mod1_aic <- AIC(linear_mod1)

linear_cm1 <- confusionMatrix(as.factor(acs_test$lin_mod1_HighIncome), as.factor(acs_test$HighIncome), positive = "1")

linear_cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4230  206
##          1   47   66
##                                           
##                Accuracy : 0.9444          
##                  95% CI : (0.9373, 0.9509)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.123           
##                                           
##                   Kappa : 0.319           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.24265         
##             Specificity : 0.98901         
##          Pos Pred Value : 0.58407         
##          Neg Pred Value : 0.95356         
##              Prevalence : 0.05979         
##          Detection Rate : 0.01451         
##    Detection Prevalence : 0.02484         
##       Balanced Accuracy : 0.61583         
##                                           
##        'Positive' Class : 1               
## 
# Residual Analysis
summary(acs_test$HighIncome - predict(linear_mod1,newdata=acs_test))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -399700 -138300 -101600 -110700  -76090   28880

Linear regression model 2

linear_mod2 <- lm(FamilyIncome ~ Insurance + HouseCosts + ElectricBill + NumWorkers + FamilyType + 
                    FoodStamp + OwnRent + NumBedrooms + NumChildren + NumRooms + NumPeople + 
                    NumVehicles + Language, data=acs_fit)

summary(linear_mod2)
## 
## Call:
## lm(formula = FamilyIncome ~ Insurance + HouseCosts + ElectricBill + 
##     NumWorkers + FamilyType + FoodStamp + OwnRent + NumBedrooms + 
##     NumChildren + NumRooms + NumPeople + NumVehicles + Language, 
##     data = acs_fit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -386370  -37621   -9512   20578 1182262 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -4.907e+04  4.482e+03 -10.947  < 2e-16 ***
## Insurance               1.891e+01  7.552e-01  25.047  < 2e-16 ***
## HouseCosts              2.791e+01  6.532e-01  42.723  < 2e-16 ***
## ElectricBill            4.849e+01  6.330e+00   7.659 1.96e-14 ***
## NumWorkers              1.865e+04  8.823e+02  21.136  < 2e-16 ***
## FamilyTypeMale Head     7.558e+03  3.179e+03   2.378 0.017433 *  
## FamilyTypeMarried       3.036e+04  1.875e+03  16.191  < 2e-16 ***
## FoodStampYes           -1.214e+04  2.548e+03  -4.766 1.89e-06 ***
## OwnRentOutright         5.753e+04  7.362e+03   7.814 5.83e-15 ***
## OwnRentRented           7.785e+03  2.241e+03   3.474 0.000513 ***
## NumBedrooms             2.337e+03  7.579e+02   3.083 0.002050 ** 
## NumChildren             3.963e+03  7.790e+02   5.087 3.67e-07 ***
## NumRooms                4.482e+03  3.381e+02  13.254  < 2e-16 ***
## NumPeople              -7.638e+03  7.218e+02 -10.582  < 2e-16 ***
## NumVehicles             7.058e+03  7.393e+02   9.547  < 2e-16 ***
## LanguageEnglish         3.073e+03  3.400e+03   0.904 0.366076    
## LanguageOther          -1.447e+03  6.383e+03  -0.227 0.820722    
## LanguageOther European -1.789e+03  3.850e+03  -0.465 0.642238    
## LanguageSpanish        -9.829e+03  3.931e+03  -2.500 0.012415 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 82010 on 18177 degrees of freedom
## Multiple R-squared:  0.3413, Adjusted R-squared:  0.3407 
## F-statistic: 523.3 on 18 and 18177 DF,  p-value: < 2.2e-16
acs_test$lin_mod2_FamilyIncome <- predict(linear_mod2, newdata=acs_test)

acs_test$lin_mod2_HighIncome <- ifelse(acs_test$lin_mod2_FamilyIncome > 250000,1,0)

linear_mod2_rsq <- summary(linear_mod2)$r.sq

linear_mod2_aic <- AIC(linear_mod2)

linear_cm2 <- confusionMatrix(as.factor(acs_test$lin_mod2_HighIncome), as.factor(acs_test$HighIncome), positive = "1")

linear_cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4227  207
##          1   50   65
##                                         
##                Accuracy : 0.9435        
##                  95% CI : (0.9364, 0.95)
##     No Information Rate : 0.9402        
##     P-Value [Acc > NIR] : 0.1827        
##                                         
##                   Kappa : 0.3114        
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.23897       
##             Specificity : 0.98831       
##          Pos Pred Value : 0.56522       
##          Neg Pred Value : 0.95332       
##              Prevalence : 0.05979       
##          Detection Rate : 0.01429       
##    Detection Prevalence : 0.02528       
##       Balanced Accuracy : 0.61364       
##                                         
##        'Positive' Class : 1             
## 
# Residual Analysis
summary(acs_test$HighIncome - predict(linear_mod2,newdata=acs_test))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -437000 -139600 -103600 -110700  -74330   44130

Support Vector Machines

“Finds the line/plane/hyperplane that separates the two groups of data as much as possible, and then see which side the new data points land on.” click the link below to see rest of SVM explanation. They give two great examples in simple terms in this forum:

https://www.reddit.com/r/MachineLearning/comments/15zrpp/please_explain_support_vector_machines_svm_like_i/

SVM process

  1. Specify the model
  2. Predict using model
  3. Set binomial variable equal to predictions with probability greater than 50%
  4. Confusion matrix
  5. Display confusion matrix

SVM model 1

svm_mod1 <- ksvm(HighIncome ~ Insurance + HouseCosts + ElectricBill + FamilyType + OwnRent + NumVehicles + NumBedrooms + 
                NumWorkers + NumPeople + NumChildren + NumUnits + FoodStamp + YearBuilt + Language + HeatingFuel, 
                data=acs_fit, family=binomial(link="logit"))

acs_test$svm_HighIncome <- predict(svm_mod1, newdata=acs_test, type='response')

acs_test$svm_HighIncome <- (acs_test$svm_HighIncome > 0.5) * 1

svm_cm1 <- confusionMatrix(as.factor(acs_test$svm_HighIncome), as.factor(acs_test$HighIncome), positive = "1")

svm_cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4259  231
##          1   18   41
##                                           
##                Accuracy : 0.9453          
##                  95% CI : (0.9383, 0.9517)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.07832         
##                                           
##                   Kappa : 0.2313          
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.150735        
##             Specificity : 0.995791        
##          Pos Pred Value : 0.694915        
##          Neg Pred Value : 0.948552        
##              Prevalence : 0.059793        
##          Detection Rate : 0.009013        
##    Detection Prevalence : 0.012970        
##       Balanced Accuracy : 0.573263        
##                                           
##        'Positive' Class : 1               
## 
# Residual Analysis
summary(acs_test$HighIncome - predict(svm_mod1,newdata=acs_test))
##        V1           
##  Min.   :-1.091970  
##  1st Qu.:-0.020412  
##  Median :-0.015125  
##  Mean   : 0.030201  
##  3rd Qu.:-0.007717  
##  Max.   : 1.017718

SVM model 2

svm_mod2 <- ksvm(FamilyIncome ~ Insurance + HouseCosts + ElectricBill + FamilyType + OwnRent + NumVehicles + NumBedrooms, 
                 data=acs_fit)

acs_test$svm2_HighIncome <- predict(svm_mod2, newdata=acs_test)

acs_test$svm2_HighIncome <- ifelse(acs_test$svm2_HighIncome > 250000,1,0)

svm_cm2 <- confusionMatrix(as.factor(acs_test$svm2_HighIncome), as.factor(acs_test$HighIncome), positive = "1")

svm_cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4252  223
##          1   25   49
##                                           
##                Accuracy : 0.9455          
##                  95% CI : (0.9385, 0.9519)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.06934         
##                                           
##                   Kappa : 0.2644          
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.18015         
##             Specificity : 0.99415         
##          Pos Pred Value : 0.66216         
##          Neg Pred Value : 0.95017         
##              Prevalence : 0.05979         
##          Detection Rate : 0.01077         
##    Detection Prevalence : 0.01627         
##       Balanced Accuracy : 0.58715         
##                                           
##        'Positive' Class : 1               
## 
# Residual Analysis
summary(acs_test$HighIncome - predict(svm_mod2,newdata=acs_test))
##        V1         
##  Min.   :-452624  
##  1st Qu.:-119155  
##  Median : -88490  
##  Mean   : -97007  
##  3rd Qu.: -66964  
##  Max.   :  -7565

SVM model 3

svm_mod3 <- ksvm(HighIncome ~ Insurance + HouseCosts + ElectricBill + FamilyType + OwnRent + NumRooms, data=acs_fit,
                 family=binomial(link="logit"))

acs_test$svm3_HighIncome <- predict(svm_mod3, newdata=acs_test, type='response')

acs_test$svm3_HighIncome <- (acs_test$svm3_HighIncome > 0.5) * 1

svm_cm3 <- confusionMatrix(as.factor(acs_test$svm3_HighIncome), as.factor(acs_test$HighIncome), positive = "1")

svm_cm3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4258  224
##          1   19   48
##                                           
##                Accuracy : 0.9466          
##                  95% CI : (0.9396, 0.9529)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.03567         
##                                           
##                   Kappa : 0.2658          
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.17647         
##             Specificity : 0.99556         
##          Pos Pred Value : 0.71642         
##          Neg Pred Value : 0.95002         
##              Prevalence : 0.05979         
##          Detection Rate : 0.01055         
##    Detection Prevalence : 0.01473         
##       Balanced Accuracy : 0.58601         
##                                           
##        'Positive' Class : 1               
## 
# Residual Analysis
summary(acs_test$HighIncome - predict(svm_mod3,newdata=acs_test))
##        V1          
##  Min.   :-1.01270  
##  1st Qu.:-0.02385  
##  Median :-0.02340  
##  Mean   : 0.02265  
##  3rd Qu.:-0.02083  
##  Max.   : 1.02540

Multilevel Model Specification

“A mixed model is similar in many ways to a linear model. It estimates the effects of one or more explanatory variables on a response variable. The output of a mixed model will give you a list of explanatory values, estimates and confidence intervals of their effect sizes, p-values for each effect, and at least one measure of how well the model fits. You should use a mixed model instead of a simple linear model when you have a variable that describes your data sample as a subset of the data you could have collected” Tufts.edu (see link for source of quote)

Check out this link for a very well explained tutorial with easy to understand explanation of mixed models:

http://ase.tufts.edu/gsc/gradresources/guidetomixedmodelsinr/mixed%20model%20guide.html

I would not use this model for this data set to specify th

  1. Specify model
  2. Model summary
  3. Predict
  4. Transform predictions to binomial > .5
  5. Confusion matrix
  6. Display confusion matrix
  7. Set variable = AIC
  8. Display AIC

MMS model using lme()

mms1 <- lme(HighIncome ~ Insurance + HouseCosts + ElectricBill + FoodStamp + 
            OwnRent, method = "ML", data = acs_fit, random =~ NumChildren | FamilyType)

summary(mms1)
## Linear mixed-effects model fit by maximum likelihood
##  Data: acs_fit 
##         AIC       BIC   logLik
##   -3398.435 -3312.537 1710.218
## 
## Random effects:
##  Formula: ~NumChildren | FamilyType
##  Structure: General positive-definite, Log-Cholesky parametrization
##             StdDev      Corr  
## (Intercept) 0.018021290 (Intr)
## NumChildren 0.002686936 -0.984
## Residual    0.220210011       
## 
## Fixed effects: HighIncome ~ Insurance + HouseCosts + ElectricBill + FoodStamp +      OwnRent 
##                       Value   Std.Error    DF    t-value p-value
## (Intercept)     -0.09421877 0.007771971 18187 -12.122892  0.0000
## Insurance        0.00004477 0.000002001 18187  22.375620  0.0000
## HouseCosts       0.00005145 0.000001701 18187  30.237485  0.0000
## ElectricBill     0.00012310 0.000016543 18187   7.441276  0.0000
## FoodStampYes    -0.01421120 0.006544939 18187  -2.171327  0.0299
## OwnRentOutright  0.11008631 0.019637856 18187   5.605821  0.0000
## OwnRentRented    0.03884131 0.005858341 18187   6.630086  0.0000
##  Correlation: 
##                 (Intr) Insrnc HsCsts ElctrB FdStmY OwnRnO
## Insurance       -0.031                                   
## HouseCosts      -0.223 -0.419                            
## ElectricBill    -0.294 -0.173 -0.221                     
## FoodStampYes    -0.128 -0.006  0.081 -0.036              
## OwnRentOutright -0.013 -0.011  0.081 -0.014  0.001       
## OwnRentRented   -0.174  0.290 -0.047 -0.005 -0.245  0.027
## 
## Standardized Within-Group Residuals:
##         Min          Q1         Med          Q3         Max 
## -2.79009543 -0.39900734 -0.11855030  0.05345452  4.86099519 
## 
## Number of Observations: 18196
## Number of Groups: 3
acs_test$mms1_HighIncome <- predict(mms1, newdata=acs_test, type='response')

acs_test$mms1_HighIncome <- (acs_test$mms1_HighIncome > 0.5) * 1

mms1_cm <- confusionMatrix(as.factor(acs_test$mms1_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
mms1_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4270  257
##          1    7   15
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9348, 0.9486)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.3221          
##                                           
##                   Kappa : 0.0939          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.055147        
##             Specificity : 0.998363        
##          Pos Pred Value : 0.681818        
##          Neg Pred Value : 0.943230        
##              Prevalence : 0.059793        
##          Detection Rate : 0.003297        
##    Detection Prevalence : 0.004836        
##       Balanced Accuracy : 0.526755        
##                                           
##        'Positive' Class : 1               
## 
mms1_aic <- AIC(mms1)
mms1_aic
## [1] -3398.435

MMS model using lmer()

mms2 <- lmer(HighIncome ~ 1 + FoodStamp + OwnRent + FamilyType + OwnRent + NumWorkers + (1 | FamilyType), data = acs_fit)

summary(mms2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: HighIncome ~ 1 + FoodStamp + OwnRent + FamilyType + OwnRent +  
##     NumWorkers + (1 | FamilyType)
##    Data: acs_fit
## 
## REML criterion at convergence: -480.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.5198 -0.3236 -0.2944 -0.1493  4.3136 
## 
## Random effects:
##  Groups     Name        Variance  Std.Dev.
##  FamilyType (Intercept) 0.0008017 0.02831 
##  Residual               0.0568581 0.23845 
## Number of obs: 18196, groups:  FamilyType, 3
## 
## Fixed effects:
##                      Estimate Std. Error t value
## (Intercept)          0.015036   0.028962   0.519
## FoodStampYes        -0.030157   0.007083  -4.258
## OwnRentOutright      0.039813   0.021262   1.873
## OwnRentRented       -0.027411   0.006054  -4.528
## FamilyTypeMale Head  0.006616   0.041088   0.161
## FamilyTypeMarried    0.048180   0.040400   1.193
## NumWorkers           0.006973   0.002218   3.144
## 
## Correlation of Fixed Effects:
##             (Intr) FdStmY OwnRnO OwnRnR FmlTMH FmlyTM
## FoodStampYs -0.050                                   
## OwnRntOtrgh -0.017  0.005                            
## OwnRentRntd -0.052 -0.253  0.034                     
## FmlyTypMlHd -0.693  0.007  0.002  0.007              
## FmlyTypMrrd -0.704  0.023 -0.002  0.020  0.497       
## NumWorkers  -0.115  0.082  0.097  0.076 -0.002 -0.020
acs_test$mms2_HighIncome <- predict(mms2, newdata=acs_test, type='response')
acs_test$mms2_HighIncome <- (acs_test$mms2_HighIncome > 0.5) * 1

mms2_cm <- confusionMatrix(as.factor(acs_test$mms2_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
## Warning in confusionMatrix.default(as.factor(acs_test$mms2_HighIncome), :
## Levels are not in the same order for reference and data. Refactoring data
## to match.
mms2_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4277  272
##          1    0    0
##                                           
##                Accuracy : 0.9402          
##                  95% CI : (0.9329, 0.9469)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.5161          
##                                           
##                   Kappa : 0               
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.00000         
##             Specificity : 1.00000         
##          Pos Pred Value :     NaN         
##          Neg Pred Value : 0.94021         
##              Prevalence : 0.05979         
##          Detection Rate : 0.00000         
##    Detection Prevalence : 0.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : 1               
## 
mms2_aic <- AIC(mms2)
mms2_aic
## [1] -462.3482

Generalized Additive Models

Important considerations of GAM

GAM’s let you represent nonlinear and non-montonic relationships between variables and outcome in linear or logistic regression framework

Evaluate the GAM with same measures as you would for simple linear or logistic regression.

Here is a good link with detailed explnation & a good code example:

https://stat.ethz.ch/R-manual/R-devel/library/mgcv/html/gam.html

GAM model process

  1. Specify model
  2. Model summary
  3. Predict using model
  4. Use estimates to create binomial for High Income
  5. Set variable = r squared
  6. Set variable = AIC
  7. Confusion matrix
  8. Display confusion matrix
  9. Residual analysis

GAM 1 - Estimating family income & use those estimates to predict High Income

gam_mod1 <- gam(FamilyIncome ~ s(Insurance) + s(HouseCosts) + s(ElectricBill) + NumWorkers + FamilyType + 
                  FoodStamp + OwnRent + NumBedrooms + s(NumChildren) + s(NumRooms) + s(NumPeople) + NumVehicles + 
                  Language, family=gaussian(link =identity), data=acs_fit)

summary(gam_mod1)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## FamilyIncome ~ s(Insurance) + s(HouseCosts) + s(ElectricBill) + 
##     NumWorkers + FamilyType + FoodStamp + OwnRent + NumBedrooms + 
##     s(NumChildren) + s(NumRooms) + s(NumPeople) + NumVehicles + 
##     Language
## 
## Parametric coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             34331.7     4910.4   6.992 2.81e-12 ***
## NumWorkers              19733.6      896.4  22.014  < 2e-16 ***
## FamilyTypeMale Head      7225.0     3155.6   2.290   0.0221 *  
## FamilyTypeMarried       29536.5     1869.6  15.798  < 2e-16 ***
## FoodStampYes           -12728.7     2538.3  -5.015 5.36e-07 ***
## OwnRentOutright         48514.6     7399.4   6.557 5.65e-11 ***
## OwnRentRented           -6545.7     3289.9  -1.990   0.0466 *  
## NumBedrooms              1270.0      776.0   1.636   0.1018    
## NumVehicles              6865.2      740.6   9.270  < 2e-16 ***
## LanguageEnglish          1527.9     3389.4   0.451   0.6521    
## LanguageOther           -1459.9     6347.0  -0.230   0.8181    
## LanguageOther European  -2123.9     3829.2  -0.555   0.5791    
## LanguageSpanish         -9877.2     3906.8  -2.528   0.0115 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df       F  p-value    
## s(Insurance)    7.729  8.458  74.504  < 2e-16 ***
## s(HouseCosts)   7.853  8.515 220.272  < 2e-16 ***
## s(ElectricBill) 1.000  1.000  59.394 1.36e-14 ***
## s(NumChildren)  3.825  4.639   9.296 4.13e-08 ***
## s(NumRooms)     4.031  4.910  43.928  < 2e-16 ***
## s(NumPeople)    2.342  2.974  34.244  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.351   Deviance explained = 35.2%
## GCV = 6.635e+09  Scale est. = 6.6205e+09  n = 18196
acs_test$gam_FamilyIncome <- predict(gam_mod1, newdata=acs_test)

acs_test$gam_HighIncome <- ifelse(acs_test$gam_FamilyIncome > 250000,1,0)

gam_mod1_rsq <- summary(gam_mod1)$r.sq

gam_mod_aic <- AIC(gam_mod1)

gam_cm1 <- confusionMatrix(as.factor(acs_test$gam_HighIncome), as.factor(acs_test$HighIncome), positive = "1")

gam_cm1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4209  206
##          1   68   66
##                                           
##                Accuracy : 0.9398          
##                  95% CI : (0.9325, 0.9465)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.5656          
##                                           
##                   Kappa : 0.2974          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.24265         
##             Specificity : 0.98410         
##          Pos Pred Value : 0.49254         
##          Neg Pred Value : 0.95334         
##              Prevalence : 0.05979         
##          Detection Rate : 0.01451         
##    Detection Prevalence : 0.02946         
##       Balanced Accuracy : 0.61337         
##                                           
##        'Positive' Class : 1               
## 
# Residual Analysis
summary(acs_test$HighIncome - predict(gam_mod1,newdata=acs_test))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -452200 -135800 -103300 -110700  -74820   50060

GAM 2 - Use logistic model with GAM to predict High Income

gam_mod2 <- gam(HighIncome ~ s(Insurance) + s(HouseCosts) + s(ElectricBill) + NumWorkers + FamilyType + FoodStamp + 
                  OwnRent + NumBedrooms + s(NumChildren) + s(NumRooms) + s(NumPeople) + NumVehicles + Language, 
                data = acs_fit, family=binomial(link="logit"))

summary(gam_mod2)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## HighIncome ~ s(Insurance) + s(HouseCosts) + s(ElectricBill) + 
##     NumWorkers + FamilyType + FoodStamp + OwnRent + NumBedrooms + 
##     s(NumChildren) + s(NumRooms) + s(NumPeople) + NumVehicles + 
##     Language
## 
## Parametric coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            -5.43414    0.31324 -17.348  < 2e-16 ***
## NumWorkers              0.19617    0.05528   3.549 0.000387 ***
## FamilyTypeMale Head     0.58537    0.32006   1.829 0.067406 .  
## FamilyTypeMarried       1.37945    0.20733   6.653 2.87e-11 ***
## FoodStampYes           -1.24465    0.37080  -3.357 0.000789 ***
## OwnRentOutright         2.18886    0.34034   6.431 1.26e-10 ***
## OwnRentRented          -0.05986    0.25095  -0.239 0.811479    
## NumBedrooms             0.04701    0.03670   1.281 0.200161    
## NumVehicles             0.22197    0.04239   5.236 1.64e-07 ***
## LanguageEnglish        -0.33222    0.16546  -2.008 0.044649 *  
## LanguageOther          -0.04632    0.28944  -0.160 0.872848    
## LanguageOther European -0.21996    0.18646  -1.180 0.238115    
## LanguageSpanish        -0.62617    0.21181  -2.956 0.003114 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                   edf Ref.df Chi.sq  p-value    
## s(Insurance)    8.570  8.934 183.64  < 2e-16 ***
## s(HouseCosts)   6.577  7.603 376.29  < 2e-16 ***
## s(ElectricBill) 1.452  1.781  32.08 1.36e-07 ***
## s(NumChildren)  3.112  3.796  16.59  0.00263 ** 
## s(NumRooms)     2.774  3.384  93.22  < 2e-16 ***
## s(NumPeople)    1.022  1.044  28.57 1.22e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.213   Deviance explained = 28.5%
## UBRE = -0.66611  Scale est. = 1         n = 18196
acs_test$gam2_HighIncome <- predict(gam_mod2, newdata=acs_test)

acs_test$gam2_HighIncome <- ifelse(acs_test$gam2_HighIncome > .5 ,1,0)

gam_mod2_rsq <- summary(gam_mod2)$r.sq

gam_mod2_aic <- AIC(gam_mod2)

gam_cm2 <- confusionMatrix(as.factor(acs_test$gam2_HighIncome), as.factor(acs_test$HighIncome), positive = "1")

gam_cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4259  239
##          1   18   33
##                                         
##                Accuracy : 0.9435        
##                  95% CI : (0.9364, 0.95)
##     No Information Rate : 0.9402        
##     P-Value [Acc > NIR] : 0.1827        
##                                         
##                   Kappa : 0.189         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.121324      
##             Specificity : 0.995791      
##          Pos Pred Value : 0.647059      
##          Neg Pred Value : 0.946865      
##              Prevalence : 0.059793      
##          Detection Rate : 0.007254      
##    Detection Prevalence : 0.011211      
##       Balanced Accuracy : 0.558557      
##                                         
##        'Positive' Class : 1             
## 
# Residual Analysis
summary(acs_test$HighIncome - predict(gam_mod2,newdata=acs_test))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -1.856   2.831   3.840   3.795   4.775   9.065

Plot GAM model 2 tosvm_cm1 visualize the s() effect

plot(gam_mod2)

MuMIn and the wonders of the dredge() function!

This package automates the model specification process to some degree. It does so by “dredging” through all possible combinations of independent variables. You can then display “best models” and importance metrics, which can then be used to specify your model.

This link is a very detailed example of using MuMIn and the dredge function. Scsprintf(“Logistic model 6: Predicted Accuracy = %.3f Predicted Sensitivity = %.3f AIC = %.1f”,

https://sites.google.com/site/rforfishandwildlifegrads/home/mumin_usage_examples

  1. Specify Model
  2. Dredge
  3. Model Summary
  4. Find best models
  5. Calculate importance weights

GLM dredge

# be aware that this takes time to run. It is worth the wait BUT frowned upon when used due to spurious results and inference
dredge_glm_mod <- glm(HighIncome ~ Insurance + HouseCosts + ElectricBill + FamilyType + FoodStamp + OwnRent + 
              NumBedrooms + NumRooms, family=binomial(logit), na.action = "na.fail", data=acs_fit)

dd_glm <- dredge(dredge_glm_mod)
## Fixed term is "(Intercept)"
summary(dredge_glm_mod)
## 
## Call:
## glm(formula = HighIncome ~ Insurance + HouseCosts + ElectricBill + 
##     FamilyType + FoodStamp + OwnRent + NumBedrooms + NumRooms, 
##     family = binomial(logit), data = acs_fit, na.action = "na.fail")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6709  -0.3183  -0.2205  -0.1516   3.8289  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -7.290e+00  2.415e-01 -30.179  < 2e-16 ***
## Insurance            3.115e-04  2.676e-05  11.637  < 2e-16 ***
## HouseCosts           5.372e-04  2.504e-05  21.455  < 2e-16 ***
## ElectricBill         2.099e-03  2.909e-04   7.215 5.39e-13 ***
## FamilyTypeMale Head  6.300e-01  3.220e-01   1.957  0.05036 .  
## FamilyTypeMarried    1.578e+00  2.064e-01   7.647 2.06e-14 ***
## FoodStampYes        -1.682e+00  3.695e-01  -4.552 5.32e-06 ***
## OwnRentOutright      1.926e+00  3.133e-01   6.148 7.83e-10 ***
## OwnRentRented        7.924e-02  1.965e-01   0.403  0.68683    
## NumBedrooms          8.726e-02  3.367e-02   2.591  0.00956 ** 
## NumRooms             1.131e-01  1.403e-02   8.060 7.62e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8398.1  on 18195  degrees of freedom
## Residual deviance: 6265.0  on 18185  degrees of freedom
## AIC: 6287
## 
## Number of Fisher Scoring iterations: 8
# best supported models
subset(dd_glm, delta < 5)
## Global model call: glm(formula = HighIncome ~ Insurance + HouseCosts + ElectricBill + 
##     FamilyType + FoodStamp + OwnRent + NumBedrooms + NumRooms, 
##     family = binomial(logit), data = acs_fit, na.action = "na.fail")
## ---
## Model selection table 
##     (Intrc)    ElctB FmlyT FdStm     HsCst     Insrn   NmBdr  NmRms OwnRn
## 256  -7.290 0.002099     +     + 0.0005372 0.0003114 0.08726 0.1131     +
## 224  -7.152 0.002180     +     + 0.0005430 0.0003188         0.1321     +
##     df    logLik   AICc delta weight
## 256 11 -3132.486 6287.0  0.00  0.908
## 224 10 -3135.771 6291.6  4.57  0.092
## Models ranked by AICc(x)
# best model
subset(dd_glm, delta == 0)
## Global model call: glm(formula = HighIncome ~ Insurance + HouseCosts + ElectricBill + 
##     FamilyType + FoodStamp + OwnRent + NumBedrooms + NumRooms, 
##     family = binomial(logit), data = acs_fit, na.action = "na.fail")
## ---
## Model selection table 
##     (Intrc)    ElctB FmlyT FdStm     HsCst     Insrn   NmBdr  NmRms OwnRn
## 256   -7.29 0.002099     +     + 0.0005372 0.0003114 0.08726 0.1131     +
##     df    logLik AICc delta weight
## 256 11 -3132.486 6287     0      1
## Models ranked by AICc(x)

LM dredge

dredge_lm_mod <- lm(HighIncome ~ FamilyType + HouseCosts + Insurance + NumRooms + ElectricBill + FoodStamp + 
                      OwnRent + NumBedrooms, data = acs_fit, na.action = na.fail)

dd_lm <- dredge(dredge_lm_mod)
## Fixed term is "(Intercept)"
summary(dredge_lm_mod)
## 
## Call:
## lm(formula = HighIncome ~ FamilyType + HouseCosts + Insurance + 
##     NumRooms + ElectricBill + FoodStamp + OwnRent + NumBedrooms, 
##     data = acs_fit, na.action = na.fail)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69682 -0.08760 -0.02763  0.01461  1.06314 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.760e-01  7.444e-03 -23.639  < 2e-16 ***
## FamilyTypeMale Head  8.939e-03  8.467e-03   1.056   0.2911    
## FamilyTypeMarried    3.353e-02  4.889e-03   6.858 7.19e-12 ***
## HouseCosts           4.880e-05  1.695e-06  28.791  < 2e-16 ***
## Insurance            4.178e-05  2.004e-06  20.845  < 2e-16 ***
## NumRooms             9.480e-03  8.948e-04  10.594  < 2e-16 ***
## ElectricBill         9.443e-05  1.662e-05   5.682 1.35e-08 ***
## FoodStampYes        -1.475e-02  6.523e-03  -2.261   0.0238 *  
## OwnRentOutright      1.269e-01  1.957e-02   6.481 9.31e-11 ***
## OwnRentRented        4.821e-02  5.871e-03   8.211 2.34e-16 ***
## NumBedrooms          2.370e-03  1.948e-03   1.217   0.2237    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2191 on 18185 degrees of freedom
## Multiple R-squared:  0.1671, Adjusted R-squared:  0.1667 
## F-statistic: 364.9 on 10 and 18185 DF,  p-value: < 2.2e-16
subset(dd_lm, delta < 5)
## Global model call: lm(formula = HighIncome ~ FamilyType + HouseCosts + Insurance + 
##     NumRooms + ElectricBill + FoodStamp + OwnRent + NumBedrooms, 
##     data = acs_fit, na.action = na.fail)
## ---
## Model selection table 
##     (Intrc)     ElctB FmlyT FdStm     HsCst     Insrn    NmBdr    NmRms
## 224 -0.1731 9.647e-05     +     + 4.896e-05 4.194e-05          0.010110
## 256 -0.1760 9.443e-05     +     + 4.880e-05 4.178e-05 0.002370 0.009480
## 220 -0.1756 9.499e-05     +       4.922e-05 4.193e-05          0.010110
## 252 -0.1782 9.317e-05     +       4.908e-05 4.179e-05 0.002055 0.009564
##     OwnRn df   logLik    AICc delta weight
## 224     + 11 1809.015 -3596.0  0.00  0.457
## 256     + 12 1809.756 -3595.5  0.52  0.352
## 220     + 10 1806.639 -3593.3  2.75  0.116
## 252     + 11 1807.199 -3592.4  3.63  0.074
## Models ranked by AICc(x)
subset(dd_lm, delta == 0)
## Global model call: lm(formula = HighIncome ~ FamilyType + HouseCosts + Insurance + 
##     NumRooms + ElectricBill + FoodStamp + OwnRent + NumBedrooms, 
##     data = acs_fit, na.action = na.fail)
## ---
## Model selection table 
##     (Intrc)     ElctB FmlyT FdStm     HsCst     Insrn   NmRms OwnRn df
## 224 -0.1731 9.647e-05     +     + 4.896e-05 4.194e-05 0.01011     + 11
##       logLik  AICc delta weight
## 224 1809.015 -3596     0      1
## Models ranked by AICc(x)

DECISION TREES

  1. Specify model
  2. use rplot with model as specified
  3. Show head of tree prediction probabilities (commented out)
  4. Show head of tree predictions as binary (commented out)
  5. Create variable as a Confusion matrix for tree
  6. Display confusion matrix
  7. Residual analysis (looking for mean near 0)

Decision tree 1

tree1 <- rpart(HighIncome ~ FamilyType + HouseCosts + NumWorkers2 + OwnRent + Insurance + NumWorkers2 + 
                 YearBuilt + NumBedrooms, data=acs_fit, method="class")

rpart.plot(tree1)

##head(predict(tree1))
##head(predict(tree1, type="class"))

tree1_cm <- confusionMatrix(predict(tree1, type="class"), acs_fit$HighIncome, positive = "1")
tree1_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16903   890
##          1   176   227
##                                           
##                Accuracy : 0.9414          
##                  95% CI : (0.9379, 0.9448)
##     No Information Rate : 0.9386          
##     P-Value [Acc > NIR] : 0.05864         
##                                           
##                   Kappa : 0.2751          
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.20322         
##             Specificity : 0.98969         
##          Pos Pred Value : 0.56328         
##          Neg Pred Value : 0.94998         
##              Prevalence : 0.06139         
##          Detection Rate : 0.01248         
##    Detection Prevalence : 0.02215         
##       Balanced Accuracy : 0.59646         
##                                           
##        'Positive' Class : 1               
## 
# Residual analysis
summary(acs_test$HighIncome - predict(tree1,newdata=acs_test))
##        0                 1            
##  Min.   :-0.9580   Min.   :-0.563275  
##  1st Qu.:-0.9580   1st Qu.:-0.041991  
##  Median :-0.9580   Median :-0.041991  
##  Mean   :-0.8776   Mean   :-0.002771  
##  3rd Qu.:-0.9580   3rd Qu.:-0.041991  
##  Max.   : 0.5633   Max.   : 0.958009

Decision tree 2

tree2 <- rpart(HighIncome ~ FoodStamp + Insurance + FamilyType, data=acs_fit, method="class", 
               control=rpart.control(minsplit=2, cp=0))

rpart.plot(tree2)

##head(predict(tree2))
##head(predict(tree2, type="class"))

tree2_cm <- confusionMatrix(predict(tree2, type="class"), acs_fit$HighIncome, positive = "1")
tree2_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 17062  1086
##          1    17    31
##                                           
##                Accuracy : 0.9394          
##                  95% CI : (0.9358, 0.9428)
##     No Information Rate : 0.9386          
##     P-Value [Acc > NIR] : 0.3397          
##                                           
##                   Kappa : 0.0484          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.027753        
##             Specificity : 0.999005        
##          Pos Pred Value : 0.645833        
##          Neg Pred Value : 0.940159        
##              Prevalence : 0.061387        
##          Detection Rate : 0.001704        
##    Detection Prevalence : 0.002638        
##       Balanced Accuracy : 0.513379        
##                                           
##        'Positive' Class : 1               
## 
# Residual analysis
summary(acs_test$HighIncome - predict(tree2,newdata=acs_test))
##        0                 1            
##  Min.   :-1.0000   Min.   :-1.000000  
##  1st Qu.:-0.9838   1st Qu.:-0.074370  
##  Median :-0.9572   Median :-0.022409  
##  Mean   :-0.8779   Mean   :-0.002539  
##  3rd Qu.:-0.9256   3rd Qu.:-0.016161  
##  Max.   : 0.5714   Max.   : 1.000000

Decision tree 3

tree3 <- rpart(HighIncome ~ Insurance + ElectricBill + HouseCosts, data=acs_fit, method="class", 
               control=rpart.control(minsplit=2, cp=.005))

rpart.plot(tree3)

##head(predict(tree3))
##head(predict(tree3, type="class"))

tree3_cm <- confusionMatrix(predict(tree3, type="class"), acs_fit$HighIncome, positive = "1")
tree3_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16991   957
##          1    88   160
##                                           
##                Accuracy : 0.9426          
##                  95% CI : (0.9391, 0.9459)
##     No Information Rate : 0.9386          
##     P-Value [Acc > NIR] : 0.013           
##                                           
##                   Kappa : 0.217           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.143241        
##             Specificity : 0.994847        
##          Pos Pred Value : 0.645161        
##          Neg Pred Value : 0.946679        
##              Prevalence : 0.061387        
##          Detection Rate : 0.008793        
##    Detection Prevalence : 0.013629        
##       Balanced Accuracy : 0.569044        
##                                           
##        'Positive' Class : 1               
## 
# Residual analysis
summary(acs_test$HighIncome - predict(tree3,newdata=acs_test))
##        0                 1            
##  Min.   :-0.9580   Min.   :-0.740260  
##  1st Qu.:-0.9580   1st Qu.:-0.041991  
##  Median :-0.9580   Median :-0.041991  
##  Mean   :-0.8779   Mean   :-0.002525  
##  3rd Qu.:-0.9580   3rd Qu.:-0.041991  
##  Max.   : 0.7403   Max.   : 0.958009

Decision tree 4

tree4 <- rpart(HighIncome ~ Insurance + ElectricBill + HouseCosts + NumBedrooms + NumChildren + 
                 NumPeople + NumRooms + NumVehicles + NumWorkers + FoodStamp + OwnRent + ElectricBill + 
                 HouseCosts, data=acs_fit, method="class", control=rpart.control(minsplit=2, cp=0))

rpart.plot(tree4)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

##head(predict(tree4))
##head(predict(tree4, type="class"))

tree4_cm <- confusionMatrix(predict(tree4, type="class"), acs_fit$HighIncome, positive = "1")
tree4_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 17079     1
##          1     0  1116
##                                      
##                Accuracy : 0.9999     
##                  95% CI : (0.9997, 1)
##     No Information Rate : 0.9386     
##     P-Value [Acc > NIR] : <2e-16     
##                                      
##                   Kappa : 0.9995     
##  Mcnemar's Test P-Value : 1          
##                                      
##             Sensitivity : 0.99910    
##             Specificity : 1.00000    
##          Pos Pred Value : 1.00000    
##          Neg Pred Value : 0.99994    
##              Prevalence : 0.06139    
##          Detection Rate : 0.06133    
##    Detection Prevalence : 0.06133    
##       Balanced Accuracy : 0.99955    
##                                      
##        'Positive' Class : 1          
## 
# Residual analysis
summary(acs_test$HighIncome - predict(tree4,newdata=acs_test))
##        0                 1           
##  Min.   :-1.0000   Min.   :-1.00000  
##  1st Qu.:-1.0000   1st Qu.: 0.00000  
##  Median :-1.0000   Median : 0.00000  
##  Mean   :-0.8648   Mean   :-0.01561  
##  3rd Qu.:-1.0000   3rd Qu.: 0.00000  
##  Max.   : 1.0000   Max.   : 1.00000

Decision tree 5

tree5 <- rpart(HighIncome ~ Insurance + ElectricBill + HouseCosts + NumWorkers2, data=acs_fit, 
               method="class", control=rpart.control(minsplit=2, cp=.0025))

rpart.plot(tree5)

##head(predict(tree5))
##head(predict(tree5, type="class"))

tree5_cm <- confusionMatrix(predict(tree5, type="class"), acs_fit$HighIncome, positive = "1")
tree5_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16972   920
##          1   107   197
##                                           
##                Accuracy : 0.9436          
##                  95% CI : (0.9401, 0.9469)
##     No Information Rate : 0.9386          
##     P-Value [Acc > NIR] : 0.002593        
##                                           
##                   Kappa : 0.2578          
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.17637         
##             Specificity : 0.99373         
##          Pos Pred Value : 0.64803         
##          Neg Pred Value : 0.94858         
##              Prevalence : 0.06139         
##          Detection Rate : 0.01083         
##    Detection Prevalence : 0.01671         
##       Balanced Accuracy : 0.58505         
##                                           
##        'Positive' Class : 1               
## 
# Residual analysis
summary(acs_test$HighIncome - predict(tree5,newdata=acs_test))
##        0                 1            
##  Min.   :-1.0000   Min.   :-0.789474  
##  1st Qu.:-0.9580   1st Qu.:-0.041991  
##  Median :-0.9580   Median :-0.041991  
##  Mean   :-0.8780   Mean   :-0.002413  
##  3rd Qu.:-0.9580   3rd Qu.:-0.041991  
##  Max.   : 0.7895   Max.   : 0.958009

“Random” Bagged Forests

Specify model and view

rand_bag1 <- randomForest(HighIncome ~ Insurance + ElectricBill + HouseCosts + NumBedrooms , data=acs_test, mtry=4,
                          importance = TRUE, na.action = na.omit)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
rand_bag1
## 
## Call:
##  randomForest(formula = HighIncome ~ Insurance + ElectricBill +      HouseCosts + NumBedrooms, data = acs_test, mtry = 4, importance = TRUE,      na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.05218558
##                     % Var explained: 7.17
# plot model
plot(rand_bag1)

#### Paper or plastic? Time to bag using fit data

  1. Predict using model on fit data set
  2. Display tail of predictions for visual inspection
  3. Transform prediction probability to binary 1 or 0 based on criteria prob > .5
  4. Create confusion matrix
  5. Display confusion matrix
  6. Residual analysis
bag1_fit <- predict(rand_bag1, acs_fit, type="class" )

tail(bag1_fit)
##         22739         22740         22741         22742         22743 
##  1.846667e-02  6.073333e-02  1.120000e-02  2.377667e-01 -8.773537e-17 
##         22744 
## -8.827661e-17
acs_fit$HighIncome_bag <- ifelse(bag1_fit > .5,1,0)

bag_cm1_fit <- confusionMatrix(acs_fit$HighIncome_bag, acs_fit$HighIncome, positive = "1")

bag_cm1_fit
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16862   930
##          1   217   187
##                                           
##                Accuracy : 0.937           
##                  95% CI : (0.9333, 0.9405)
##     No Information Rate : 0.9386          
##     P-Value [Acc > NIR] : 0.827           
##                                           
##                   Kappa : 0.2205          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.16741         
##             Specificity : 0.98729         
##          Pos Pred Value : 0.46287         
##          Neg Pred Value : 0.94773         
##              Prevalence : 0.06139         
##          Detection Rate : 0.01028         
##    Detection Prevalence : 0.02220         
##       Balanced Accuracy : 0.57735         
##                                           
##        'Positive' Class : 1               
## 
# Residual analysis
summary(acs_test$HighIncome - predict(rand_bag1,newdata=acs_fit))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.975100 -0.066800 -0.007067 -0.009178  0.000000  1.000000

Predict using model on test data

  1. Make predictions
  2. Transform prediction probability to binary 1 or 0 based on criteria prob > .5
  3. Create confusion matrix
  4. Display confusion matrix
  5. Residual analysis
bag1_pred <- predict(rand_bag1, acs_test, type="class" )

acs_test$HighIncome_bag <- ifelse(bag1_pred > .5,1,0)

bag_cm1_pred <- confusionMatrix(acs_test$HighIncome_bag, acs_test$HighIncome, positive = "1")

bag_cm1_pred
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4277   84
##          1    0  188
##                                           
##                Accuracy : 0.9815          
##                  95% CI : (0.9772, 0.9852)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.808           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.69118         
##             Specificity : 1.00000         
##          Pos Pred Value : 1.00000         
##          Neg Pred Value : 0.98074         
##              Prevalence : 0.05979         
##          Detection Rate : 0.04133         
##    Detection Prevalence : 0.04133         
##       Balanced Accuracy : 0.84559         
##                                           
##        'Positive' Class : 1               
## 
# Residual analysis
summary(acs_test$HighIncome - predict(rand_bag1,newdata=acs_test))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.483500 -0.027030 -0.002400 -0.003444  0.000000  0.789600

Random Forest

Specify model and view

rand_forest <- randomForest(HighIncome ~ Insurance + ElectricBill + HouseCosts + NumBedrooms + ElectricBill + 
                            NumVehicles + NumChildren + NumPeople + FoodStamp + Language + NumRooms + OwnRent + 
                            NumUnits, data=acs_test, mtry=12, importance = TRUE, na.action = na.omit)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

Predict on fit data

rf_fit <- predict(rand_forest, acs_fit, type="class" )

acs_fit$HighIncome_rf_fit <- ifelse(rf_fit > .5,1,0)

rf_cm_fit <- confusionMatrix(acs_fit$HighIncome_rf_fit, acs_fit$HighIncome, positive = "1")

rf_cm_fit
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 16889   930
##          1   190   187
##                                           
##                Accuracy : 0.9384          
##                  95% CI : (0.9349, 0.9419)
##     No Information Rate : 0.9386          
##     P-Value [Acc > NIR] : 0.5448          
##                                           
##                   Kappa : 0.2264          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.16741         
##             Specificity : 0.98888         
##          Pos Pred Value : 0.49602         
##          Neg Pred Value : 0.94781         
##              Prevalence : 0.06139         
##          Detection Rate : 0.01028         
##    Detection Prevalence : 0.02072         
##       Balanced Accuracy : 0.57814         
##                                           
##        'Positive' Class : 1               
## 

Test Model on acs_test to see how well it predicts

rf_pred <- predict(rand_forest, acs_test, type="class" )

acs_test$HighIncome_rf <- ifelse(rf_pred > .5,1,0)

rf_cm_pred <- confusionMatrix(acs_test$HighIncome_rf, acs_test$HighIncome, positive = "1")

rf_cm_pred
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4277   44
##          1    0  228
##                                         
##                Accuracy : 0.9903        
##                  95% CI : (0.987, 0.993)
##     No Information Rate : 0.9402        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.9069        
##  Mcnemar's Test P-Value : 9.022e-11     
##                                         
##             Sensitivity : 0.83824       
##             Specificity : 1.00000       
##          Pos Pred Value : 1.00000       
##          Neg Pred Value : 0.98982       
##              Prevalence : 0.05979       
##          Detection Rate : 0.05012       
##    Detection Prevalence : 0.05012       
##       Balanced Accuracy : 0.91912       
##                                         
##        'Positive' Class : 1             
## 
# Residual analysis
summary(acs_test$HighIncome - predict(rand_forest,newdata=acs_test))
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.388500 -0.025800 -0.002567 -0.003068  0.000000  0.623100

Tree predictions

  1. Make predictions using test data
  2. Confusion matrix
  3. Display all models for comparison
# make predictions using test data
tree1_pred <- predict(tree1, acs_test, type="class" )
tree2_pred <- predict(tree2, acs_test, type="class" ) 
tree3_pred <- predict(tree3, acs_test, type="class" ) 
tree4_pred <- predict(tree4, acs_test, type="class" )
tree5_pred <- predict(tree5, acs_test, type="class" )

# Confusion matrices
tree_cm1_pred <- confusionMatrix(tree1_pred, acs_test$HighIncome, positive = "1")
tree_cm2_pred <- confusionMatrix(tree2_pred, acs_test$HighIncome, positive = "1")
tree_cm3_pred <- confusionMatrix(tree3_pred, acs_test$HighIncome, positive = "1")
tree_cm4_pred <- confusionMatrix(tree4_pred, acs_test$HighIncome, positive = "1")
tree_cm5_pred <- confusionMatrix(tree5_pred, acs_test$HighIncome, positive = "1")

K-nearest neighbor

  1. k-nearest neighbor can only take numerical data
  2. It is recommended (in most all cases) to normalize the data set

First Normalize the dataset

# function to normalize data
normalize <- function(x) {
num <- x - min(x)
denom <- max(x) - min(x)
return (num/denom)
}
# create and normalize a new data frame for knn analysis
acs_numericals <- data.frame(acs$NumBedrooms, acs$NumChildren, acs$NumPeople, acs$NumRooms, acs$NumVehicles, acs$NumWorkers, acs$HouseCosts, acs$ElectricBill, acs$Insurance)
acs_norm <- as.data.frame(lapply(acs_numericals[1:8], normalize))
acs_norm$HighIncome1 <- c(acs$HighIncome)

Split data frame into learn and validate subsets

  1. Count nunber of rows
  2. Create index of random row numbers for validation set
  3. Create the learning and validate data sets
m <- nrow(acs_numericals)

val <- sample(1:m, size = round(m/3))

acsNorm_learn <- acs_norm[-val,]
acsNorm_valid <- acs_norm[val,]
# view new data frame to verify normalization
summary(acs_norm)
##  acs.NumBedrooms  acs.NumChildren  acs.NumPeople     acs.NumRooms   
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3750   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.2500  
##  Median :0.3750   Median :0.0000   Median :0.0625   Median :0.3000  
##  Mean   :0.4232   Mean   :0.0751   Mean   :0.0869   Mean   :0.3087  
##  3rd Qu.:0.5000   3rd Qu.:0.1667   3rd Qu.:0.1250   3rd Qu.:0.3500  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  acs.NumVehicles  acs.NumWorkers   acs.HouseCosts    acs.ElectricBill
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.3333   1st Qu.:0.3333   1st Qu.:0.09117   1st Qu.:0.1710  
##  Median :0.3333   Median :0.6667   Median :0.16878   Median :0.2573  
##  Mean   :0.3521   Mean   :0.5815   Mean   :0.20836   Mean   :0.3006  
##  3rd Qu.:0.5000   3rd Qu.:0.6667   3rd Qu.:0.28168   3rd Qu.:0.3782  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##   HighIncome1     
##  Min.   :0.00000  
##  1st Qu.:0.00000  
##  Median :0.00000  
##  Mean   :0.06107  
##  3rd Qu.:0.00000  
##  Max.   :1.00000

knn method

  1. specify knn model
  2. create a visualization
  3. create a confusion matrix

knn 1

acs_knn1 <- knn(acsNorm_learn[,1:8], acsNorm_valid[,1:8], acsNorm_learn$HighIncome1, k=5, prob = TRUE)
##head(acs_knn1)

pcol1 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[1:8], pch = pcol1, col = c("green3", "red")
  [(acsNorm_valid$HighIncome1 != acs_knn1)+1])

knn1_cm_pred <- confusionMatrix(acs_knn1, acsNorm_valid$HighIncome, positive = "1")
knn1_cm_pred
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7019  420
##          1   75   68
##                                           
##                Accuracy : 0.9347          
##                  95% CI : (0.9289, 0.9402)
##     No Information Rate : 0.9356          
##     P-Value [Acc > NIR] : 0.6394          
##                                           
##                   Kappa : 0.192           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.139344        
##             Specificity : 0.989428        
##          Pos Pred Value : 0.475524        
##          Neg Pred Value : 0.943541        
##              Prevalence : 0.064363        
##          Detection Rate : 0.008969        
##    Detection Prevalence : 0.018860        
##       Balanced Accuracy : 0.564386        
##                                           
##        'Positive' Class : 1               
## 

knn 2

acs_knn2 <- knn(acsNorm_learn[,1:4], acsNorm_valid[,1:4], acsNorm_learn$HighIncome, k=5, prob = TRUE)
##head(acs_knn2)

pcol2 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[2:5], pch = pcol2, col = c("green3", "red")
  [(acsNorm_valid$HighIncome1 != acs_knn2)+1])

knn2_cm_pred <- confusionMatrix(acs_knn2, acsNorm_valid$HighIncome, positive = "1")
knn2_cm_pred
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7067  477
##          1   27   11
##                                          
##                Accuracy : 0.9335         
##                  95% CI : (0.9277, 0.939)
##     No Information Rate : 0.9356         
##     P-Value [Acc > NIR] : 0.7808         
##                                          
##                   Kappa : 0.0328         
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.022541       
##             Specificity : 0.996194       
##          Pos Pred Value : 0.289474       
##          Neg Pred Value : 0.936771       
##              Prevalence : 0.064363       
##          Detection Rate : 0.001451       
##    Detection Prevalence : 0.005012       
##       Balanced Accuracy : 0.509367       
##                                          
##        'Positive' Class : 1              
## 

knn 3

acs_knn3 <- knn(acsNorm_learn[,6:8], acsNorm_valid[,6:8], acsNorm_learn$HighIncome, k=3, prob = TRUE)
##head(acs_knn3)

pcol3 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[6:8], pch = pcol3, col = c("green3", "red")
  [(acsNorm_valid$HighIncome1 != acs_knn3)+1])

knn3_cm_pred <- confusionMatrix(acs_knn3, acsNorm_valid$HighIncome, positive = "1")
knn3_cm_pred
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7014  421
##          1   80   67
##                                           
##                Accuracy : 0.9339          
##                  95% CI : (0.9281, 0.9394)
##     No Information Rate : 0.9356          
##     P-Value [Acc > NIR] : 0.7376          
##                                           
##                   Kappa : 0.1868          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.137295        
##             Specificity : 0.988723        
##          Pos Pred Value : 0.455782        
##          Neg Pred Value : 0.943376        
##              Prevalence : 0.064363        
##          Detection Rate : 0.008837        
##    Detection Prevalence : 0.019388        
##       Balanced Accuracy : 0.563009        
##                                           
##        'Positive' Class : 1               
## 

knn 4

acs_knn4 <- knn(acsNorm_learn[,1:8], acsNorm_valid[,1:8], acsNorm_learn$HighIncome, k=10, prob = TRUE)
##head(acs_knn4)

pcol4 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[1:8], pch = pcol4, col = c("green3", "red")
  [(acsNorm_valid$HighIncome1 != acs_knn4)+1])

knn4_cm_pred <- confusionMatrix(acs_knn4, acsNorm_valid$HighIncome, positive = "1")
knn4_cm_pred
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7050  435
##          1   44   53
##                                           
##                Accuracy : 0.9368          
##                  95% CI : (0.9311, 0.9422)
##     No Information Rate : 0.9356          
##     P-Value [Acc > NIR] : 0.3475          
##                                           
##                   Kappa : 0.1633          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.10861         
##             Specificity : 0.99380         
##          Pos Pred Value : 0.54639         
##          Neg Pred Value : 0.94188         
##              Prevalence : 0.06436         
##          Detection Rate : 0.00699         
##    Detection Prevalence : 0.01279         
##       Balanced Accuracy : 0.55120         
##                                           
##        'Positive' Class : 1               
## 

knn 5

acs_knn5 <- knn(acsNorm_learn[,1:8], acsNorm_valid[,1:8], acsNorm_learn$HighIncome, k=25, prob = TRUE)

#head(acs_knn5)

pcol5 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[1:8], pch = pcol5, col = c("green3", "red")
  [(acsNorm_valid$HighIncome1 != acs_knn5)+1])

knn5_cm_pred <- confusionMatrix(acs_knn5, acsNorm_valid$HighIncome, positive = "1")
knn5_cm_pred
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7071  430
##          1   23   58
##                                           
##                Accuracy : 0.9403          
##                  95% CI : (0.9347, 0.9455)
##     No Information Rate : 0.9356          
##     P-Value [Acc > NIR] : 0.05198         
##                                           
##                   Kappa : 0.189           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.11885         
##             Specificity : 0.99676         
##          Pos Pred Value : 0.71605         
##          Neg Pred Value : 0.94267         
##              Prevalence : 0.06436         
##          Detection Rate : 0.00765         
##    Detection Prevalence : 0.01068         
##       Balanced Accuracy : 0.55781         
##                                           
##        'Positive' Class : 1               
## 

knn 6

acs_knn6 <- knn(acsNorm_learn[,1:8], acsNorm_valid[,1:8], acsNorm_learn$HighIncome, k=50, prob = TRUE)
##head(acs_knn6)
pcol6 <- as.character(as.numeric(acsNorm_valid$HighIncome1))

pairs(acsNorm_valid[1:8], pch = pcol6, col = c("green3", "red")
  [(acsNorm_valid$HighIncome1 != acs_knn6)+1])

knn6_cm_pred <- confusionMatrix(acs_knn6, acsNorm_valid$HighIncome, positive = "1")
knn6_cm_pred
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7069  432
##          1   25   56
##                                          
##                Accuracy : 0.9397         
##                  95% CI : (0.9341, 0.945)
##     No Information Rate : 0.9356         
##     P-Value [Acc > NIR] : 0.07568        
##                                          
##                   Kappa : 0.1818         
##  Mcnemar's Test P-Value : < 2e-16        
##                                          
##             Sensitivity : 0.114754       
##             Specificity : 0.996476       
##          Pos Pred Value : 0.691358       
##          Neg Pred Value : 0.942408       
##              Prevalence : 0.064363       
##          Detection Rate : 0.007386       
##    Detection Prevalence : 0.010683       
##       Balanced Accuracy : 0.555615       
##                                          
##        'Positive' Class : 1              
## 

Cluster Analysis

Function to normalize data

normalize <- function(x) {
num <- x - min(x)
denom <- max(x) - min(x)
return (num/denom)
}

First cluster attempt

  1. Create data frame
  2. Normalize data
  3. Calculate distances
  4. Plot cluster dendogram
  5. Cut the tree into k number of clusters
acs_numericals2 <- data.frame(acs$HouseCosts, acs$ElectricBill, acs$Insurance)
acs_norm2 <- as.data.frame(lapply(acs_numericals2[1:3], normalize))
acs_norm2$FamilyType1 <- c(acs$FamilyType)

distance <- dist(acs_norm2, method ="euclidean")
pfit <- hclust(distance, method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
cluster1 <- plot(pfit, labels = acs_norm2$FamilyType)
rect.hclust(pfit, k=5)

acs_norm2$groups <- cutree(pfit, k=5)

Display each goup with number of observations per group

table(acs_norm2$groups)
## 
##    1    2    3    4    5 
## 7075 3266 1153 7641 3610
# Add additional variables ot the data set for model specification
acs_norm2$HighIncome <- c(acs$HighIncome)
# Break it into a training and test set with an 80/20 split.
set.seed(447)
testrecs <- sample(nrow(acs_norm2),0.2 * nrow(acs_norm2))
acs_norm2_test <- acs_norm2[testrecs,]
acs_norm2_fit <- acs_norm2[-testrecs,]  

Use groups as new variable in model

Model with clusters

  1. Specify model using GAM
  2. Show summary of model
  3. Set variable = r squared
  4. Set variable = AIC
  5. Set predictions to binomial based on criteria > .5
  6. Confusion Matrix
  7. Display confusion matrix, r squared, and AIC
gam_mod_cluster <- gam(HighIncome ~ s(acs.Insurance) + s(acs.HouseCosts) + s(acs.ElectricBill) + groups, 
                data = acs_norm2_fit, family=binomial(link="logit"))

summary(gam_mod_cluster)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## HighIncome ~ s(acs.Insurance) + s(acs.HouseCosts) + s(acs.ElectricBill) + 
##     groups
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.47032    0.08509 -40.784   <2e-16 ***
## groups       0.03128    0.02440   1.282      0.2    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df Chi.sq p-value    
## s(acs.Insurance)    8.499  8.911 221.90 < 2e-16 ***
## s(acs.HouseCosts)   7.135  8.093 481.71 < 2e-16 ***
## s(acs.ElectricBill) 1.619  2.021  48.26 3.8e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.177   Deviance explained = 23.3%
## UBRE = -0.64375  Scale est. = 1         n = 18196
acs_norm2_test$gam_cluster_HighIncome <- predict(gam_mod_cluster, newdata=acs_norm2_test)

acs_norm2_test$gam_cluster_HighIncome <- ifelse(acs_norm2_test$gam_cluster_HighIncome > .5 ,1,0)

gam_cluster_cm <- confusionMatrix(as.factor(acs_norm2_test$gam_cluster_HighIncome), as.factor(acs_norm2_test$HighIncome),
                                  positive = "1")

gam_cluster_adjR <- summary(gam_mod_cluster)$r.sq

gam_cluster_aic <- AIC(gam_mod_cluster)

gam_cluster_cm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4258  247
##          1   19   25
##                                           
##                Accuracy : 0.9415          
##                  95% CI : (0.9343, 0.9482)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.3685          
##                                           
##                   Kappa : 0.144           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.091912        
##             Specificity : 0.995558        
##          Pos Pred Value : 0.568182        
##          Neg Pred Value : 0.945172        
##              Prevalence : 0.059793        
##          Detection Rate : 0.005496        
##    Detection Prevalence : 0.009672        
##       Balanced Accuracy : 0.543735        
##                                           
##        'Positive' Class : 1               
## 
gam_cluster_adjR
## [1] 0.1772829
gam_cluster_aic
## [1] 6482.354

Second cluster attempt

acs_numericals3 <- data.frame(acs$HouseCosts, acs$ElectricBill, acs$Insurance, acs$NumBedrooms, 
                              acs$NumChildren, acs$NumWorkers, acs$NumRooms)

acs_norm3 <- as.data.frame(lapply(acs_numericals3[1:7], normalize))

distance2 <- dist(acs_norm3, method ="euclidean")

pfit2 <- hclust(distance2, method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
cluster2 <- plot(pfit2)

rect.hclust(pfit2, k=5)

acs_norm3$groups <- cutree(pfit2, k=15)

Display each goup with number of observations per group

table(acs_norm3$groups)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1606  306 2402  932 1460 1166 1550  617 2387 2754 1089 2855 1623 1723  275

Use groups as new variable in model

# Add additional variables ot the data set for model specification
acs_norm3$HighIncome <- c(acs$HighIncome)
acs_norm3$FamilyType <- c(acs$FamilyType)
acs_norm3$Foodstamp <- c(acs$FoodStamp)
acs_norm3$OwnRent <- c(acs$OwnRent)
# Break it into a training and test set with an 80/20 split.
set.seed(447)
testrecs <- sample(nrow(acs_norm3),0.2 * nrow(acs_norm3))
acs_norm3_test <- acs_norm3[testrecs,]
acs_norm3_fit <- acs_norm3[-testrecs,]  

Model with clusters

gam_mod_cluster2 <- gam(HighIncome ~ s(acs.Insurance) + s(acs.HouseCosts) + s(acs.ElectricBill) + 
                        acs.NumBedrooms + s(acs.NumChildren) + acs.NumWorkers + s(acs.NumRooms) + 
                        FamilyType + Foodstamp + OwnRent + groups, data = acs_norm3_fit, 
                        family=binomial(link="logit"))

summary(gam_mod_cluster2)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## HighIncome ~ s(acs.Insurance) + s(acs.HouseCosts) + s(acs.ElectricBill) + 
##     acs.NumBedrooms + s(acs.NumChildren) + acs.NumWorkers + s(acs.NumRooms) + 
##     FamilyType + Foodstamp + OwnRent + groups
## 
## Parametric coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -4.59167    0.52466  -8.752  < 2e-16 ***
## acs.NumBedrooms  0.30668    0.28764   1.066   0.2863    
## acs.NumWorkers  -0.36105    0.20503  -1.761   0.0783 .  
## FamilyType       0.77413    0.09801   7.899 2.82e-15 ***
## Foodstamp       -1.64131    0.36794  -4.461 8.17e-06 ***
## OwnRent          0.06540    0.12115   0.540   0.5893    
## groups           0.07053    0.01507   4.679 2.88e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df  Chi.sq  p-value    
## s(acs.Insurance)    8.529  8.921 212.986  < 2e-16 ***
## s(acs.HouseCosts)   6.567  7.595 288.269  < 2e-16 ***
## s(acs.ElectricBill) 1.487  1.833  35.341 2.79e-08 ***
## s(acs.NumChildren)  3.022  3.700   9.904   0.0449 *  
## s(acs.NumRooms)     2.896  3.530 108.486  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.21   Deviance explained = 27.8%
## UBRE = -0.66332  Scale est. = 1         n = 18196
acs_norm3_test$gam_cluster_HighIncome <- predict(gam_mod_cluster2, newdata=acs_norm3_test)

acs_norm3_test$gam_cluster_HighIncome <- ifelse(acs_norm3_test$gam_cluster_HighIncome > .5 ,1,0)

gam_cluster_cm2 <- confusionMatrix(as.factor(acs_norm3_test$gam_cluster_HighIncome), as.factor(acs_norm3_test$HighIncome),
                                  positive = "1")

gam_cluster2_adjR <- summary(gam_mod_cluster2)$r.sq

gam_cluster2_aic <- AIC(gam_mod_cluster2)

gam_cluster_cm2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 4258  237
##          1   19   35
##                                           
##                Accuracy : 0.9437          
##                  95% CI : (0.9366, 0.9502)
##     No Information Rate : 0.9402          
##     P-Value [Acc > NIR] : 0.1663          
##                                           
##                   Kappa : 0.1989          
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.128676        
##             Specificity : 0.995558        
##          Pos Pred Value : 0.648148        
##          Neg Pred Value : 0.947275        
##              Prevalence : 0.059793        
##          Detection Rate : 0.007694        
##    Detection Prevalence : 0.011871        
##       Balanced Accuracy : 0.562117        
##                                           
##        'Positive' Class : 1               
## 
gam_cluster2_adjR
## [1] 0.2100819
gam_cluster2_aic
## [1] 6126.215

Model Comparison

Display important metrics for each model

sprintf("LOGISTIC REGRESSION")
## [1] "LOGISTIC REGRESSION"
sprintf("Logistic model 1: Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        log_cm1$overall['Accuracy'], log_cm1$byClass['Sensitivity'], log_mod1_aic)
## [1] "Logistic model 1: Predicted Accuracy = 0.6324   Predicted Sensitivity = 0.790   AIC = 7118.0"
sprintf("Logistic model 2: Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        log_cm2$overall['Accuracy'], log_cm2$byClass['Sensitivity'], log_mod2_aic)
## [1] "Logistic model 2: Predicted Accuracy = 0.3317   Predicted Sensitivity = 0.897   AIC = 8049.1"
sprintf("Logistic model 3: Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        log_cm3$overall['Accuracy'], log_cm3$byClass['Sensitivity'], log_mod3_aic)
## [1] "Logistic model 3: Predicted Accuracy = 0.7232   Predicted Sensitivity = 0.735   AIC = 7319.5"
sprintf("Logistic model 4: Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        log_cm4$overall['Accuracy'], log_cm4$byClass['Sensitivity'], log_mod4_aic)
## [1] "Logistic model 4: Predicted Accuracy = 0.7191   Predicted Sensitivity = 0.732   AIC = 7412.8"
sprintf("Logistic model 5: Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        log_cm5$overall['Accuracy'], log_cm5$byClass['Sensitivity'], log_mod5_aic)
## [1] "Logistic model 5: Predicted Accuracy = 0.7402   Predicted Sensitivity = 0.816   AIC = 6188.7"
sprintf("Logistic model 6: Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        log_cm6$overall['Accuracy'], log_cm6$byClass['Sensitivity'], log_mod6_aic)
## [1] "Logistic model 6: Predicted Accuracy = 0.7270   Predicted Sensitivity = 0.798   AIC = 6374.8"
sprintf("Logistic model 6: Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        log_cm6$overall['Accuracy'], log_cm6$byClass['Sensitivity'], log_mod6_aic)
## [1] "Logistic model 6: Predicted Accuracy = 0.7270   Predicted Sensitivity = 0.798   AIC = 6374.8"
sprintf("                                                                                                      ")
## [1] "                                                                                                      "
sprintf("LINEAR REGRESSION")
## [1] "LINEAR REGRESSION"
sprintf("Linear model 1:   Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   Adj R-squared = %.3f", 
        linear_cm1$overall['Accuracy'], linear_cm1$byClass['Sensitivity'], linear_mod1_rsq)
## [1] "Linear model 1:   Predicted Accuracy = 0.9444   Predicted Sensitivity = 0.243   Adj R-squared = 0.314"
sprintf("Linear model 2:   Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   Adj R-squared = %.3f", 
        linear_cm2$overall['Accuracy'], linear_cm2$byClass['Sensitivity'], linear_mod2_rsq)
## [1] "Linear model 2:   Predicted Accuracy = 0.9435   Predicted Sensitivity = 0.239   Adj R-squared = 0.341"
sprintf("                                                                                                      ")
## [1] "                                                                                                      "
sprintf("SUPPORT VECTOR MACHINES")
## [1] "SUPPORT VECTOR MACHINES"
sprintf("SVM model 1:      Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f", 
        svm_cm1$overall['Accuracy'], svm_cm1$byClass['Sensitivity'])
## [1] "SVM model 1:      Predicted Accuracy = 0.9453   Predicted Sensitivity = 0.151"
sprintf("SVM model 2:      Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f", 
        svm_cm2$overall['Accuracy'], svm_cm2$byClass['Sensitivity'])
## [1] "SVM model 2:      Predicted Accuracy = 0.9455   Predicted Sensitivity = 0.180"
sprintf("SVM model 3:      Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f", 
        svm_cm3$overall['Accuracy'], svm_cm3$byClass['Sensitivity'])
## [1] "SVM model 3:      Predicted Accuracy = 0.9466   Predicted Sensitivity = 0.176"
sprintf("                                                                                                      ")
## [1] "                                                                                                      "
sprintf("MULTILEVEL MODEL SPECIFICATION")
## [1] "MULTILEVEL MODEL SPECIFICATION"
sprintf("MMS model 1:      Predicted Accuracy = NA        Predicted Sensitivity = NA    AIC = %.1f", mms1_aic)
## [1] "MMS model 1:      Predicted Accuracy = NA        Predicted Sensitivity = NA    AIC = -3398.4"
sprintf("MMS model 2:      Predicted Accuracy = NA        Predicted Sensitivity = NA    AIC = %.1f", mms2_aic)
## [1] "MMS model 2:      Predicted Accuracy = NA        Predicted Sensitivity = NA    AIC = -462.3"
sprintf("                                                                                                      ")
## [1] "                                                                                                      "
sprintf("GENERALIZED ADDITIVE MODELS")
## [1] "GENERALIZED ADDITIVE MODELS"
sprintf("GAM model 1:      Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        gam_cm1$overall['Accuracy'], gam_cm1$byClass['Sensitivity'], gam_mod_aic)
## [1] "GAM model 1:      Predicted Accuracy = 0.9398   Predicted Sensitivity = 0.243   AIC = 463153.9"
sprintf("GAM model 2:      Predicted Accuracy = %.4f   Predicted Sensitivity = %.3f   AIC = %.1f", 
        gam_cm2$overall['Accuracy'], gam_cm2$byClass['Sensitivity'], gam_mod2_aic)
## [1] "GAM model 2:      Predicted Accuracy = 0.9435   Predicted Sensitivity = 0.121   AIC = 6075.4"
sprintf("                                                                                                      ")
## [1] "                                                                                                      "
sprintf("DECISION TREES")
## [1] "DECISION TREES"
sprintf("Tree 1: Fit Accuracy = %.4f   Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f",tree1_cm$overall['Accuracy'], 
        tree_cm1_pred$overall['Accuracy'], tree_cm1_pred$byClass['Sensitivity'])
## [1] "Tree 1: Fit Accuracy = 0.9414   Predicted Accuracy = 0.9422   Predicted Sensitivity = 0.2169"
sprintf("Tree 2: Fit Accuracy = %.4f   Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f",tree2_cm$overall['Accuracy'], 
        tree_cm2_pred$overall['Accuracy'], tree_cm2_pred$byClass['Sensitivity'])
## [1] "Tree 2: Fit Accuracy = 0.9394   Predicted Accuracy = 0.9391   Predicted Sensitivity = 0.0037"
sprintf("Tree 3: Fit Accuracy = %.4f   Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f",tree3_cm$overall['Accuracy'], 
        tree_cm3_pred$overall['Accuracy'], tree_cm3_pred$byClass['Sensitivity'])
## [1] "Tree 3: Fit Accuracy = 0.9426   Predicted Accuracy = 0.9426   Predicted Sensitivity = 0.1471"
sprintf("Tree 4: Fit Accuracy = %.4f   Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f",tree4_cm$overall['Accuracy'], 
        tree_cm4_pred$overall['Accuracy'], tree_cm4_pred$byClass['Sensitivity'])
## [1] "Tree 4: Fit Accuracy = 0.9999   Predicted Accuracy = 0.8965   Predicted Sensitivity = 0.2647"
sprintf("Tree 5: Fit Accuracy = %.4f   Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f",tree5_cm$overall['Accuracy'], 
        tree_cm5_pred$overall['Accuracy'], tree_cm5_pred$byClass['Sensitivity'])
## [1] "Tree 5: Fit Accuracy = 0.9436   Predicted Accuracy = 0.9424   Predicted Sensitivity = 0.1654"
sprintf("Bag 1:  Fit Accuracy = %.4f   Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f",bag_cm1_fit$overall['Accuracy'], 
        bag_cm1_pred$overall['Accuracy'], bag_cm1_pred$byClass['Sensitivity'])
## [1] "Bag 1:  Fit Accuracy = 0.9370   Predicted Accuracy = 0.9815   Predicted Sensitivity = 0.6912"
sprintf("RF:    Fit Accuracy = %.4f Predicted Accuracy = %.4f Predicted Sensitivity = %.4f",rf_cm_fit$overall['Accuracy'], 
        rf_cm_pred$overall['Accuracy'], rf_cm_pred$byClass['Sensitivity'])
## [1] "RF:    Fit Accuracy = 0.9384 Predicted Accuracy = 0.9903 Predicted Sensitivity = 0.8382"
sprintf("                                                                                                      ")
## [1] "                                                                                                      "
sprintf("K NEAREST NEIGHBOR")
## [1] "K NEAREST NEIGHBOR"
sprintf("Knn 1:            Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f", knn1_cm_pred$overall['Accuracy'], 
        knn1_cm_pred$byClass['Sensitivity'])
## [1] "Knn 1:            Predicted Accuracy = 0.9347   Predicted Sensitivity = 0.1393"
sprintf("Knn 2:            Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f", knn2_cm_pred$overall['Accuracy'], 
        knn2_cm_pred$byClass['Sensitivity'])
## [1] "Knn 2:            Predicted Accuracy = 0.9335   Predicted Sensitivity = 0.0225"
sprintf("Knn 3:            Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f", knn3_cm_pred$overall['Accuracy'], 
        knn3_cm_pred$byClass['Sensitivity'])
## [1] "Knn 3:            Predicted Accuracy = 0.9339   Predicted Sensitivity = 0.1373"
sprintf("Knn 4:            Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f", knn4_cm_pred$overall['Accuracy'], 
        knn4_cm_pred$byClass['Sensitivity'])
## [1] "Knn 4:            Predicted Accuracy = 0.9368   Predicted Sensitivity = 0.1086"
sprintf("Knn 5:            Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f", knn5_cm_pred$overall['Accuracy'], 
        knn5_cm_pred$byClass['Sensitivity'])
## [1] "Knn 5:            Predicted Accuracy = 0.9403   Predicted Sensitivity = 0.1189"
sprintf("Knn 6:            Predicted Accuracy = %.4f   Predicted Sensitivity = %.4f", knn6_cm_pred$overall['Accuracy'], 
        knn6_cm_pred$byClass['Sensitivity'])
## [1] "Knn 6:            Predicted Accuracy = 0.9397   Predicted Sensitivity = 0.1148"
sprintf("                                                                                                      ")
## [1] "                                                                                                      "
sprintf("CLUSTER ANALYSIS USED IN MODELS")
## [1] "CLUSTER ANALYSIS USED IN MODELS"
sprintf("GAM cluster 1:    Predicted Accuracy = %.3f    Predicted Sensitivity = %.3f   AIC = %.1f", 
        gam_cluster_cm$overall['Accuracy'], gam_cluster_cm$byClass['Sensitivity'], gam_cluster_aic)
## [1] "GAM cluster 1:    Predicted Accuracy = 0.942    Predicted Sensitivity = 0.092   AIC = 6482.4"
sprintf("GAM cluster 2:    Predicted Accuracy = %.3f    Predicted Sensitivity = %.3f   AIC = %.1f", 
        gam_cluster_cm2$overall['Accuracy'], gam_cluster_cm2$byClass['Sensitivity'], gam_cluster2_aic)
## [1] "GAM cluster 2:    Predicted Accuracy = 0.944    Predicted Sensitivity = 0.129   AIC = 6126.2"

BEST MODEL = Random Forest

Prediction accuracy > .9903

Prediction Sensitivity = .8382